Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: update gitlab repo remote url #354

Merged
merged 9 commits into from
Jul 16, 2023
4 changes: 4 additions & 0 deletions src/macaron/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,7 @@ class ConfigurationError(MacaronError):

class CloneError(MacaronError):
"""Happens when cannot clone a git repository."""


class RepoError(MacaronError):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The relation between a RepoError and CloneError is not clear and they seem to have an overlap. RepoError seems to be related to checking out a repo, so I wonder if it should be renamed to something like CheckOutError? Or we could add a RepoError as a parent class of CloneError and CheckOutError.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I will rename RepoError to RepoCheckOutError and update the doc string accordingly for clarity.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in 38db651

"""Happens when there is an error when preparing the repository for the analysis."""
30 changes: 27 additions & 3 deletions src/macaron/slsa_analyzer/analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
NoneDependencyAnalyzer,
)
from macaron.dependency_analyzer.cyclonedx import get_deps_from_sbom
from macaron.errors import CloneError
from macaron.errors import CloneError, RepoError
from macaron.output_reporter.reporter import FileReporter
from macaron.output_reporter.results import Record, Report, SCMStatus
from macaron.slsa_analyzer import git_url
Expand Down Expand Up @@ -504,6 +504,7 @@ def _prepare_repo(
Git
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The type here should be Git | None. Can you please fix it?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in 204d5f8.

The pydriller.Git object of the repository or None if error.
"""
# TODO: separate the logic for handling remote and local repos instead of putting them into this method.
behnazh-w marked this conversation as resolved.
Show resolved Hide resolved
# Cannot specify a commit hash without specifying the branch.
if not branch_name and digest:
logger.error(
Expand Down Expand Up @@ -560,8 +561,31 @@ def _prepare_repo(
logger.error("Cannot reset the target repository.")
return None

if not git_url.check_out_repo_target(git_obj, branch_name, digest, (not is_remote)):
logger.error("Cannot checkout the specific branch or commit of the target repo.")
# Checking out the specific branch or commit. This operation varies depends on the git service that the
# repository uses.
if not is_remote:
# If the repo path provided by the user is a local path, we need to get the actual origin remote URL of
# the repo to decide on the suitable git service.
origin_remote_url = git_url.get_remote_origin_of_local_repo(git_obj)
if git_url.is_remote_repo(origin_remote_url):
# The local repo's origin remote url is a remote URL (e.g https://host.com/a/b): In this case, we obtain
# the corresponding git service using ``self.get_git_service``.
git_service = self.get_git_service(origin_remote_url)
else:
# The local repo's origin remote url is a local path (e.g /path/to/local/...). This happens when the
# target repository is a clone from another local repo or is a clone from a git archive -
# https://git-scm.com/docs/git-archive: In this case, we fall-back to the generic function
# ``git_url.check_out_repo_target``.
if not git_url.check_out_repo_target(git_obj, branch_name, digest, not is_remote):
logger.error("Cannot checkout the specific branch or commit of the target repo.")
return None

return git_obj

try:
git_service.check_out_repo(git_obj, branch_name, digest, not is_remote)
except RepoError as error:
logger.error(error)
return None

return git_obj
Expand Down
46 changes: 45 additions & 1 deletion src/macaron/slsa_analyzer/git_service/base_git_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@

from abc import abstractmethod

from pydriller.git import Git

from macaron.config.defaults import defaults
from macaron.errors import CloneError, ConfigurationError
from macaron.errors import CloneError, ConfigurationError, RepoError
from macaron.slsa_analyzer import git_url


Expand Down Expand Up @@ -112,6 +114,33 @@ def clone_repo(self, clone_dir: str, url: str) -> None:
"""
raise NotImplementedError()

@abstractmethod
def check_out_repo(self, git_obj: Git, branch: str, digest: str, offline_mode: bool) -> Git:
"""Checkout the branch and commit specified by the user of a repository.

Parameters
----------
git_obj : Git
The Git object for the repository to check out.
branch : str
The branch to check out.
digest : str
The sha of the commit to check out.
offline_mode: bool
If true, no fetching is performed.

Returns
-------
Git
The same Git object from the input.

Raises
------
RepoError
If there is an error while checking out the specific branch or commit.
"""
raise NotImplementedError()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is no need to raise the NotImplementedError. The abc module handles unimplemented abstract methods.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in 73cd1bc



class NoneGitService(BaseGitService):
"""This class can be used to initialize an empty git service."""
Expand Down Expand Up @@ -154,3 +183,18 @@ def clone_repo(self, _clone_dir: str, url: str) -> None:
Always raise, since this method should not be used to clone any repository.
"""
raise CloneError(f"Internal error encountered when cloning the repo '{url}'.")

def check_out_repo(self, git_obj: Git, branch: str, digest: str, offline_mode: bool) -> Git:
"""Checkout the branch and commit specified by the user of a repository.

In this particular case, since this class represents a ``None`` git service,
we do nothing but raise a ``RepoError``.

Raises
------
RepoError
Always raise, since this method should not be used to check out in any repository.
"""
raise RepoError(
f"Internal error when checking out branch {branch} and commit {digest} for repo {git_obj.project_name}."
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of Internal error can you please be more specific. For example here the error should communicate that we cannot check out from an empty git service.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in 77e4d14.

)
7 changes: 7 additions & 0 deletions src/macaron/slsa_analyzer/git_service/bitbucket.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@

import logging

from pydriller.git import Git

from macaron.errors import RepoError
from macaron.slsa_analyzer.git_service.base_git_service import BaseGitService

logger: logging.Logger = logging.getLogger(__name__)
Expand All @@ -26,3 +29,7 @@ def clone_repo(self, _clone_dir: str, _url: str) -> None:
"""Clone a BitBucket repo."""
# TODO: implement this once support for BitBucket is added.
logger.info("Cloning BitBucket repositories is not supported yet. Please clone the repository manually.")

def check_out_repo(self, git_obj: Git, branch: str, digest: str, offline_mode: bool) -> Git:
"""Checkout the branch and commit specified by the user of a repository."""
raise RepoError("Checking out a branch or commit on a Bitbucket repository is not supported yet.")
35 changes: 34 additions & 1 deletion src/macaron/slsa_analyzer/git_service/github.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,10 @@

"""This module contains the spec for the GitHub service."""

from pydriller.git import Git

from macaron.config.global_config import global_config
from macaron.errors import ConfigurationError
from macaron.errors import ConfigurationError, RepoError
from macaron.slsa_analyzer import git_url
from macaron.slsa_analyzer.git_service.api_client import GhAPIClient, get_default_gh_client
from macaron.slsa_analyzer.git_service.base_git_service import BaseGitService
Expand Down Expand Up @@ -56,3 +58,34 @@ def clone_repo(self, clone_dir: str, url: str) -> None:
If there is an error cloning the repo.
"""
git_url.clone_remote_repo(clone_dir, url)

def check_out_repo(self, git_obj: Git, branch: str, digest: str, offline_mode: bool) -> Git:
"""Checkout the branch and commit specified by the user of a repository.

Parameters
----------
git_obj : Git
The Git object for the repository to check out.
branch : str
The branch to check out.
digest : str
The sha of the commit to check out.
offline_mode: bool
If true, no fetching is performed.

Returns
-------
Git
The same Git object from the input.

Raises
------
RepoError
If there is error while checkout the specific branch and digest.
"""
if not git_url.check_out_repo_target(git_obj, branch, digest, offline_mode):
raise RepoError(
behnazh-w marked this conversation as resolved.
Show resolved Hide resolved
f"Internal error when checking out branch {branch} and commit {digest} for repo {git_obj.project_name}."
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
f"Internal error when checking out branch {branch} and commit {digest} for repo {git_obj.project_name}."
f"Failed to check out branch {branch} and commit {digest} of repo {git_obj.project_name}."

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in 77e4d14

)

return git_obj
77 changes: 75 additions & 2 deletions src/macaron/slsa_analyzer/git_service/gitlab.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@
from abc import abstractmethod
from urllib.parse import ParseResult, urlunparse

from macaron.errors import CloneError, ConfigurationError
from pydriller.git import Git

from macaron.errors import CloneError, ConfigurationError, RepoError
from macaron.slsa_analyzer import git_url
from macaron.slsa_analyzer.git_service.base_git_service import BaseGitService

Expand Down Expand Up @@ -103,6 +105,10 @@ def clone_repo(self, clone_dir: str, url: str) -> None:
To clone a GitLab repository with access token, we embed the access token in the https URL.
See GitLab documentation: https://docs.gitlab.com/ee/gitlab-basics/start-using-git.html#clone-using-a-token.

If we clone using the https URL with the token embedded, this URL will be store as plain text in .git/config as
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
If we clone using the https URL with the token embedded, this URL will be store as plain text in .git/config as
If we clone using the https URL with the token embedded, this URL will be stored as plain text in .git/config as

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in 6678fe2

the origin remote URL. Therefore, after a repository is cloned, this remote origin URL will be set
with the value of the original ``url`` (which does not have the embed token).
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
with the value of the original ``url`` (which does not have the embed token).
with the value of the original ``url`` (which does not have the embeded token).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in 6678fe2


Parameters
----------
clone_dir: str
Expand All @@ -117,7 +123,74 @@ def clone_repo(self, clone_dir: str, url: str) -> None:
If there is an error cloning the repository.
"""
clone_url = self.construct_clone_url(url)
git_url.clone_remote_repo(clone_dir, clone_url)
repo = git_url.clone_remote_repo(clone_dir, clone_url)
behnazh-w marked this conversation as resolved.
Show resolved Hide resolved

# If ``git_url.clone_remote_repo`` return an Repo instance, this means that the repository is freshly cloned
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
# If ``git_url.clone_remote_repo`` return an Repo instance, this means that the repository is freshly cloned
# If ``git_url.clone_remote_repo`` returns a Repo instance,

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in 6678fe2

# with the token embedded URL. We will set its value back to the original non-token URL.
# If ``git_url.clone_remote_repo`` returns None, it means that the repository already exists so we don't need
# to do anything.
if repo:
try:
origin_remote = repo.remote("origin")
except ValueError as error:
raise CloneError("Cannot find the remote origin for this repository.") from error

origin_remote.set_url(url)

def check_out_repo(self, git_obj: Git, branch: str, digest: str, offline_mode: bool) -> Git:
"""Checkout the branch and commit specified by the user of a repository.

For GitLab, this method set the origin remote URL of the target repository to the token-embedded URL if
a token is available before performing the checkout operation.

After the checkout operation finishes, the origin remote URL is set back again to ensure that no token-embedded
URL remains.

Parameters
----------
git_obj : Git
The Git object for the repository to check out.
branch : str
The branch to check out.
digest : str
The sha of the commit to check out.
offline_mode: bool
If true, no fetching is performed.

Returns
-------
Git
The same Git object from the input.

Raises
------
RepoError
If there is error while checkout the specific branch and digest.
"""
remote_origin_url = git_url.get_remote_origin_of_local_repo(git_obj)

try:
origin_remote = git_obj.repo.remote("origin")
except ValueError as error:
raise RepoError("Cannot find the remote origin for this repository.") from error

try:
reconstructed_url = self.construct_clone_url(remote_origin_url)
except CloneError as error:
raise RepoError("Internal error prevent preparing the repo for the analysis.") from error
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same comment about being more specific about the error and avoiding Internal error.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in 77e4d14


origin_remote.set_url(reconstructed_url, remote_origin_url)

check_out_status = git_url.check_out_repo_target(git_obj, branch, digest, offline_mode)

origin_remote.set_url(remote_origin_url, reconstructed_url)

if not check_out_status:
raise RepoError(
f"Internal error when checking out branch {branch} and commit {digest} for repo {git_obj.project_name}."
behnazh-w marked this conversation as resolved.
Show resolved Hide resolved
)

return git_obj


class SelfHostedGitLab(GitLab):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is a test file.
95 changes: 95 additions & 0 deletions tests/slsa_analyzer/git_service/test_gitlab.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,13 @@
from unittest import mock

import pytest
from pydriller.git import Git

from macaron.config.defaults import load_defaults
from macaron.errors import ConfigurationError
from macaron.slsa_analyzer import git_url
from macaron.slsa_analyzer.git_service.gitlab import PubliclyHostedGitLab, SelfHostedGitLab
from tests.slsa_analyzer.mock_git_utils import commit_files, initiate_repo


@pytest.mark.parametrize(
Expand Down Expand Up @@ -111,3 +114,95 @@ def test_self_hosted_gitlab_without_env_set(tmp_path: Path) -> None:

with pytest.raises(ConfigurationError):
gitlab.load_defaults()


@pytest.fixture(name="self_hosted_gitlab")
def self_hosted_gitlab_repo_fixture(request: pytest.FixtureRequest) -> Git:
"""Create a mock GitLab self_hosted repo.

This fixture expects ONE parameter of type STR which will be the origin remote url initialized for the self_hosted
GitLab repo. To see how we could utilize this with pytest.mark.parameterize, see:
https://docs.pytest.org/en/7.1.x/example/parametrize.html?highlight=indirect#indirect-parametrization
https://docs.pytest.org/en/7.1.x/example/parametrize.html?highlight=indirect#apply-indirect-on-particular-arguments

Returns
-------
Git
The pydriller.git.Git wrapper object for the self_hosted GitLab repo.
"""
test_repo_path = Path(__file__).parent.joinpath("resources", "self_hosted_gitlab_repo")

gitlab_repo = initiate_repo(test_repo_path)

# Commit untracked test files in the mock repo.
# This would only happen the first time this test case is run.
if gitlab_repo.repo.untracked_files:
commit_files(gitlab_repo, gitlab_repo.repo.untracked_files)

# Assigning the origin remote url to the passed in parameter.
remote_url = request.param

# For newly init git repositories, the origin remote does not exist. Therefore, we create one and assign it with
# ``remote_url``.
# If the origin remote already exist we reset its URL to be the ``remote_url``.
if "origin" not in gitlab_repo.repo.remotes:
remote_name = "origin"
gitlab_repo.repo.create_remote(remote_name, remote_url)
else:
gitlab_repo.repo.remote("origin").set_url(remote_url)

yield gitlab_repo

gitlab_repo.clear()


# The indirect parameter is used to note that ``self_hosted_gitlab`` should be passed to the ``self_hosted_gitlab``
# fixture first. The test function would then receive the final fixture value.
# As for ``expected_origin_url``, this parameter is passed directly into the test function because it's not listed in
# ``indirect``. Reference:
# https://docs.pytest.org/en/7.1.x/example/parametrize.html?highlight=indirect#apply-indirect-on-particular-arguments
@pytest.mark.parametrize(
("self_hosted_gitlab", "expected_origin_url"),
[
("https://oauth2:[email protected]/a/b", "https://internal.gitlab.org/a/b"),
("https://oauth2:[email protected]/a/b.git", "https://internal.gitlab.org/a/b"),
],
indirect=["self_hosted_gitlab"],
)
def test_origin_remote_url_masking(self_hosted_gitlab: Git, expected_origin_url: str, tmp_path: Path) -> None:
"""Test if the ``clone_repo`` and ``check_out_repo`` methods handle masking the token in the clone URL correctly.

Note that this test ONLY checks if the remote origin URL after the clone/checkout operations is updated back to
the token-less URL.
It does not check whether those operation works correctly.
"""
user_config_input = """
[git_service.gitlab.self_hosted]
domain = internal.gitlab.org
"""
user_config_path = os.path.join(tmp_path, "config.ini")
with open(user_config_path, "w", encoding="utf-8") as user_config_file:
user_config_file.write(user_config_input)

# We don't have to worry about modifying the ``defaults`` object causing test
# pollution here, since we reload the ``defaults`` object before every test with the
# ``setup_test`` fixture.
load_defaults(user_config_path)

with mock.patch.dict(os.environ, {"MCN_SELF_HOSTED_GITLAB_TOKEN": "abcxyz"}):
gitlab = SelfHostedGitLab()
gitlab.load_defaults()

behnazh-w marked this conversation as resolved.
Show resolved Hide resolved
with mock.patch("macaron.slsa_analyzer.git_url.clone_remote_repo", return_value=self_hosted_gitlab.repo):
# We check the origin remote URL after cloning is as expected.
gitlab.clone_repo(str(tmp_path), expected_origin_url)
assert git_url.get_remote_origin_of_local_repo(self_hosted_gitlab) == expected_origin_url

with mock.patch("macaron.slsa_analyzer.git_url.check_out_repo_target", return_value=self_hosted_gitlab.repo):
# We check that after checking out the latest commit in the default branch, the origin remote
# URL is as expected.
gitlab.check_out_repo(self_hosted_gitlab, "", "", True)
assert git_url.get_remote_origin_of_local_repo(self_hosted_gitlab) == expected_origin_url

gitlab.check_out_repo(self_hosted_gitlab, "", "", False)
assert git_url.get_remote_origin_of_local_repo(self_hosted_gitlab) == expected_origin_url
Loading