Skip to content

Commit

Permalink
perf: use partial clone to reduce clone time (#389)
Browse files Browse the repository at this point in the history
Signed-off-by: Nathan Nguyen <[email protected]>
Signed-off-by: Trong Nhan Mai <[email protected]>
Co-authored-by: Trong Nhan Mai <[email protected]>
  • Loading branch information
nathanwn and tromai authored Nov 3, 2023
1 parent 37a96bb commit ec4e190
Show file tree
Hide file tree
Showing 4 changed files with 143 additions and 12 deletions.
8 changes: 8 additions & 0 deletions docs/source/pages/developers_guide/apidoc/macaron.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,14 @@ Subpackages
Submodules
----------

macaron.environment\_variables module
-------------------------------------

.. automodule:: macaron.environment_variables
   :members:
   :undoc-members:
   :show-inheritance:

macaron.errors module
---------------------

Expand Down
45 changes: 45 additions & 0 deletions src/macaron/environment_variables.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Copyright (c) 2023 - 2023, Oracle and/or its affiliates. All rights reserved.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.

"""Helper functions related to environment variables."""

import os
from collections.abc import Mapping


def get_patched_env(
patch: Mapping[str, str | None],
_env: dict[str, str] | None = None,
) -> dict[str, str]:
"""Return a dictionary whose elements copied from ``os.environ`` and are updated according to ``patch``.
This function does not modify ``os.environ``.
Parameters
----------
patch : Mapping[str, str | None]
A mapping (immutable) in which:
- each key is an environment variable.
- each value is the value to set to the corresponding environment variable.
If value is ``None``, the environment variable is "unset".
_env : dict[str, str] | None
The environment being updated.
This is ``None`` by default, in which case ``os.environ`` is being updated.
Returns
-------
dict[str, str]
The the dictionary contains the patched env variables.
"""
env = os.environ if _env is None else _env

# Make a copy of the environment.
copied_env = dict(env)

for var, value in patch.items():
if value is None:
copied_env.pop(var, None)
else:
copied_env[var] = value

return copied_env
51 changes: 39 additions & 12 deletions src/macaron/slsa_analyzer/git_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,18 @@
import os
import re
import string
import subprocess # nosec B404
import urllib.parse
from configparser import ConfigParser
from pathlib import Path

from git import GitCommandError
from git.objects import Commit
from git.repo import Repo
from pydriller.git import Git

from macaron.config.defaults import defaults
from macaron.environment_variables import get_patched_env
from macaron.errors import CloneError

logger: logging.Logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -235,6 +238,12 @@ def clone_remote_repo(clone_dir: str, url: str) -> Repo | None:
This could happen when multiple runs of Macaron use the same `<output_dir>`, leading
to Macaron potentially trying to clone a repository multiple times.
We use treeless partial clone to reduce clone time, by retrieving trees and blobs lazily.
For more details, see the following:
- https://git-scm.com/docs/partial-clone
- https://git-scm.com/docs/git-rev-list
- https://github.blog/2020-12-21-get-up-to-speed-with-partial-clone-and-shallow-clone
Parameters
----------
clone_dir : str
Expand Down Expand Up @@ -268,20 +277,38 @@ def clone_remote_repo(clone_dir: str, url: str) -> Repo | None:
)
return None

# Ensure that the parent directory where the repo is cloned into exists.
parent_dir = Path(clone_dir).parent
parent_dir.mkdir(parents=True, exist_ok=True)

try:
# The Repo.clone_from method handles creating intermediate dirs.
return Repo.clone_from(
url=url,
to_path=clone_dir,
env={
# Setting the GIT_TERMINAL_PROMPT environment variable to ``0`` stops
# ``git clone`` from prompting for login credentials.
"GIT_TERMINAL_PROMPT": "0",
},
git_env_patch = {
# Setting the GIT_TERMINAL_PROMPT environment variable to ``0`` stops
# ``git clone`` from prompting for login credentials.
"GIT_TERMINAL_PROMPT": "0",
}
result = subprocess.run( # nosec B603
args=["git", "clone", "--filter=tree:0", url],
capture_output=True,
cwd=parent_dir,
# If `check=True` and return status code is not zero, subprocess.CalledProcessError is
# raised, which we don't want. We want to check the return status code of the subprocess
# later on.
check=False,
env=get_patched_env(git_env_patch),
)
except GitCommandError as error:
# stderr here does not contain secrets, so it is safe for logging.
raise CloneError(error.stderr) from None
except (subprocess.CalledProcessError, OSError):
# Here, we raise from ``None`` to be extra-safe that no token is leaked.
# We should never store or print out the captured output from the subprocess
# because they might contain the secret-embedded URL.
raise CloneError("Failed to clone repository.") from None

if result.returncode != 0:
raise CloneError(
"Failed to clone repository: the `git clone --filter=tree:0` command exited with non-zero return code."
)

return Repo(path=clone_dir)


def get_repo_name_from_url(url: str) -> str:
Expand Down
51 changes: 51 additions & 0 deletions tests/test_environment_variables.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Copyright (c) 2023 - 2023, Oracle and/or its affiliates. All rights reserved.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.

"""Tests for helper functions related to environment variables."""

import pytest

from macaron.environment_variables import get_patched_env


@pytest.mark.parametrize(
    argnames=("before", "patch", "expect"),
    argvalues=[
        pytest.param(
            {"FOO": "some-value"},
            {},
            {"FOO": "some-value"},
            id="patch is empty",
        ),
        pytest.param(
            {"FOO": "some-value"},
            {"GIT_TERMINAL_PROMPT": "0"},
            {"FOO": "some-value", "GIT_TERMINAL_PROMPT": "0"},
            id="patch adding a variable",
        ),
        pytest.param(
            {"GIT_TERMINAL_PROMPT": "1"},
            {"GIT_TERMINAL_PROMPT": "0"},
            {"GIT_TERMINAL_PROMPT": "0"},
            id="patch overriding a variable",
        ),
        pytest.param(
            {"GIT_TERMINAL_PROMPT": "0"},
            {"GIT_TERMINAL_PROMPT": None},
            {},
            id="patch removing a variable",
        ),
    ],
)
def test_patched_env(
    before: dict[str, str],
    patch: dict[str, str | None],
    expect: dict[str, str],
) -> None:
    """Check that ``get_patched_env`` applies ``patch`` on top of the ``before`` environment."""
    # Pass a private copy of the starting environment to the helper.
    starting_env = before.copy()
    patched = get_patched_env(patch, starting_env)

    assert patched == expect

0 comments on commit ec4e190

Please sign in to comment.