From f7259eb59f2fa713a0ce5b5142f7693a78cfe517 Mon Sep 17 00:00:00 2001 From: Ben Selwyn-Smith Date: Thu, 26 Oct 2023 13:16:15 +1000 Subject: [PATCH] chore: extend optional suffix matching to prefer tags that most closely match the version parts Signed-off-by: Ben Selwyn-Smith --- src/macaron/repo_finder/commit_finder.py | 79 +++++++++++++++++++----- tests/repo_finder/test_repo_finder.py | 45 ++++++-------- tests/slsa_analyzer/mock_git_utils.py | 18 +++--- 3 files changed, 95 insertions(+), 47 deletions(-) diff --git a/src/macaron/repo_finder/commit_finder.py b/src/macaron/repo_finder/commit_finder.py index fc0431807..33ea87878 100644 --- a/src/macaron/repo_finder/commit_finder.py +++ b/src/macaron/repo_finder/commit_finder.py @@ -15,11 +15,13 @@ logger: logging.Logger = logging.getLogger(__name__) ALPHANUMERIC = "0-9a-z" -PREFIX = "(?:.*)" +# Any number of irrelevant prefix characters followed by 1 to 2 prefix-version separators (optional). +PREFIX = "(?:.*[-_/vr]{1,2})?" # Version token separator: 1 to 3 alphabetic characters OR 1 to 3 non-alphanumeric characters. INFIX = "([a-z]{1,3}|[^0-9a-z]{1,3})" split_pattern = re.compile(f"[^{ALPHANUMERIC}]+", flags=re.IGNORECASE) validation_pattern = re.compile(f"[{ALPHANUMERIC}]+", flags=re.IGNORECASE) +alphabetic_pattern = re.compile(".*[a-z].*", flags=re.IGNORECASE) def get_commit_from_version(git_obj: Git, purl: PackageURL) -> tuple[str, str]: @@ -44,8 +46,15 @@ def get_commit_from_version(git_obj: Git, purl: PackageURL) -> tuple[str, str]: return "", "" logger.debug("Searching for commit of artifact version using tags: %s@%s", purl.name, purl.version) - target_version_pattern = _build_version_pattern(purl.version) - has_name_pattern = re.compile(f".*{re.escape(purl.name)}.*[0-9].*", flags=re.IGNORECASE) + target_version_pattern, parts = _build_version_pattern(purl.version) + # The has_name_pattern tries to determine whether a given tag contains the name of the current artifact as a prefix + # and at least one number that could belong to a version. + # .* : Any number of irrelevant pre-prefix characters. + # purl.name : The artifact name (escaped). + # [^a-z0-9]{1,3} : 1 to 3 non-numeric characters. + # [0-9] : A single numeric character. + # .* : The remainder of the version and possible suffixes. + has_name_pattern = re.compile(f".*{re.escape(purl.name)}[^{ALPHANUMERIC}]" + "{1,3}[0-9].*", flags=re.IGNORECASE) # Tags are examined as followed: # - Any without a corresponding commit are discarded. @@ -70,9 +79,15 @@ def get_commit_from_version(git_obj: Git, purl: PackageURL) -> tuple[str, str]: # Match tags. if named_tags: - matched_tags = _match_tags(named_tags, target_version_pattern) + matched_tags = _match_tags(named_tags, target_version_pattern, parts) + if not matched_tags: + # See if there are any possible matches from non-named tags. + possible_matches = _match_tags(other_tags, target_version_pattern, parts) + if possible_matches: + # TODO Determine whether to accept a possible match. + logger.debug("Possible tag matches: %s", possible_matches) else: - matched_tags = _match_tags(other_tags, target_version_pattern) + matched_tags = _match_tags(other_tags, target_version_pattern, parts) # Report matched tags. if not matched_tags: @@ -116,7 +131,7 @@ def get_commit_from_version(git_obj: Git, purl: PackageURL) -> tuple[str, str]: return "", "" -def _build_version_pattern(version: str) -> Pattern: +def _build_version_pattern(version: str) -> tuple[Pattern, list[str]]: """Build a version pattern to match the passed version string. Parameters @@ -126,8 +141,8 @@ def _build_version_pattern(version: str) -> Pattern: Returns ------- - Pattern - The regex pattern that will match the version. + tuple[Pattern, list[str]] + The tuple of the regex pattern that will match the version, and the list of version parts that were extracted. """ # The version is split on non-alphanumeric characters to separate the version parts from the non-version parts. @@ -139,24 +154,32 @@ def _build_version_pattern(version: str) -> Pattern: split = [version] this_version_pattern = "" - for part in split: + parts = [] + for count, part in enumerate(split): # Validate the split part by checking it is only comprised of alphanumeric characters. valid = validation_pattern.match(part) if not valid: continue + parts.append(part) + # If the final version part has at least one alphabetic character in it, consider it to be optional. + # This requires wrapping it and the previous INFIX in brackets. + has_alphabetic_suffix = count == len(split) - 1 and alphabetic_pattern.match(str(part)) + if has_alphabetic_suffix: + this_version_pattern = this_version_pattern + "(" if this_version_pattern: - # Between one and three non-numeric characters are accepted between the version parts. - # This balances the tradeoff between maximal matching and minimal false positives. + # INFIX matches separators as described where it is instantiated. this_version_pattern = this_version_pattern + INFIX this_version_pattern = this_version_pattern + str(part) + if has_alphabetic_suffix: + this_version_pattern = this_version_pattern + ")?" # Prepend the optional prefix, add a named capture group for the version, and enforce end of string analysis. this_version_pattern = PREFIX + "(?P" + this_version_pattern + ")" + "$" logger.debug("Created pattern: %s", this_version_pattern) - return re.compile(this_version_pattern, flags=re.IGNORECASE) + return re.compile(this_version_pattern, flags=re.IGNORECASE), parts -def _match_tags(tag_list: list[TagReference], pattern: Pattern) -> list[TagReference]: +def _match_tags(tag_list: list[TagReference], pattern: Pattern, parts: list[str]) -> list[TagReference]: """Return items of the passed tag list that match the passed pattern. Parameters @@ -165,10 +188,13 @@ def _match_tags(tag_list: list[TagReference], pattern: Pattern) -> list[TagRefer The list of tags to check. pattern: Pattern The pattern to match against. + parts: list[str] + The list of version parts extracted from the version. Returns ------- - The list of tags that matched the pattern. + list[TagReference] + The list of tags that matched the pattern. """ matched_tags = [] for tag in tag_list: @@ -177,4 +203,29 @@ def _match_tags(tag_list: list[TagReference], pattern: Pattern) -> list[TagRefer if not match: continue matched_tags.append(tag) + if len(matched_tags) > 1: + # Sort the matched tags so that tags that contain all the version parts are prioritised. + matched_tags.sort(key=lambda item: _count_parts_in_tag(str(item), parts)) return matched_tags + + +def _count_parts_in_tag(tag: str, parts: list[str]) -> int: + """Count how many version part strings are contained in the tag. + + Parameters + ---------- + tag: str + The tag to be checked. + parts: list[str] + The version parts extracted from the version string. + + Returns + ------- + int + The count of contained parts subtracted from the number of parts. + """ + count = len(parts) + for part in parts: + if part in tag: + count = count - 1 + return count diff --git a/tests/repo_finder/test_repo_finder.py b/tests/repo_finder/test_repo_finder.py index de3b309f7..bf328816c 100644 --- a/tests/repo_finder/test_repo_finder.py +++ b/tests/repo_finder/test_repo_finder.py @@ -3,6 +3,7 @@ """This module tests the repo finder.""" import os +import shutil from pathlib import Path import pytest @@ -12,7 +13,7 @@ from macaron.config.target_config import Configuration from macaron.repo_finder import commit_finder from macaron.slsa_analyzer.analyzer import Analyzer -from tests.slsa_analyzer.mock_git_utils import add_tag_if_not_present, commit_nothing, initiate_repo +from tests.slsa_analyzer.mock_git_utils import add_new_commit_with_tag, initiate_repo @pytest.mark.parametrize( @@ -79,39 +80,33 @@ def test_resolve_analysis_target( def test_get_commit_from_version() -> None: """Test resolving commits from version tags.""" path = Path(__file__).parent.joinpath("mock_repo") - init_repo = not os.path.exists(path) + if os.path.exists(path): + shutil.rmtree(path) git_obj: Git = initiate_repo(path) - if init_repo: - tags = [ - "test-name-v1.0.1-A", - "v1.0.1-B", - "v1.0.3+test", - "v_1.0.5", - "50_0_2", - "r78rv109", - ] - # Add a commit for each tag with a message that can be verified later. - for count, value in enumerate(tags): - commit_nothing(git_obj, str(count)) - add_tag_if_not_present(git_obj, value) + + tags = ["test-name-v1.0.1-A", "v1.0.3+test", "v_1.0.5", "50_0_2", "r78rv109", "1.0.5-JRE"] + # Add a commit for each tag that can be verified later. + hash_targets = [] + for tag in tags: + hash_targets.append(add_new_commit_with_tag(git_obj, tag)) # Perform tests versions = [ - "1.0.1-A", - "1.0.1-B", - "1.0.3+test", - "1.0.5", - "50.0.2", - "78.109", + "1.0.1-A", # To match a tag with a named suffix. + "1.0.3+test", # To match a tag with a '+' suffix. + "1.0.5", # To match a tag with a 'v_' prefix. + "50.0.2", # To match a tag separated by '_'. + "78.109", # To match a tag separated by characters 'r' 'rv'. + "1.0.5-JRE", # To NOT match the similar tag without the 'JRE' suffix. ] purl_name = "test-name" for count, value in enumerate(versions): - _test_version(git_obj, PackageURL(type="maven", name=purl_name, version=value), str(count)) + _test_version(git_obj, PackageURL(type="maven", name=purl_name, version=value), hash_targets[count]) purl_name = "test-name" + "-" + str(count + 1) -def _test_version(git_obj: Git, purl: PackageURL, commit_message: str) -> None: - """Retrieve commit matching version and check commit message is correct.""" +def _test_version(git_obj: Git, purl: PackageURL, hash_target: str) -> None: + """Retrieve commit matching version and check commit hash is correct.""" branch, digest = commit_finder.get_commit_from_version(git_obj, purl) assert branch - assert git_obj.get_commit(digest).msg == commit_message + assert git_obj.get_commit(digest).hash == hash_target diff --git a/tests/slsa_analyzer/mock_git_utils.py b/tests/slsa_analyzer/mock_git_utils.py index 8af975f56..61a495ce5 100644 --- a/tests/slsa_analyzer/mock_git_utils.py +++ b/tests/slsa_analyzer/mock_git_utils.py @@ -67,7 +67,7 @@ def commit_files(git_wrapper: Git, file_names: list) -> bool: return False -def commit_nothing(git_wrapper: Git, message: str = "") -> bool: +def commit_nothing(git_wrapper: Git, message: str = "") -> str: """Create an empty commit in the repository indicated by the git_wrapper. Parameters @@ -79,18 +79,18 @@ def commit_nothing(git_wrapper: Git, message: str = "") -> bool: Returns ------- - bool - True if succeed else False. + str + The commit sha or an empty string if unsuccessful. """ try: # Store the index object as recommended by the documentation current_index = git_wrapper.repo.index if not message: message = "Empty commit" - current_index.commit(message) - return True + commit = current_index.commit(message) + return str(commit.hexsha) except GitError: - return False + return "" def prepare_repo_for_testing( @@ -137,7 +137,7 @@ def prepare_repo_for_testing( return analyze_ctx -def add_tag_if_not_present(git_obj: Git, tag: str) -> None: +def add_new_commit_with_tag(git_obj: Git, tag: str) -> str: """Add passed tag to repository if not already present. Parameters @@ -148,5 +148,7 @@ def add_tag_if_not_present(git_obj: Git, tag: str) -> None: The tag to possibly add. """ if tag in git_obj.repo.tags: - return + return "" + sha = commit_nothing(git_obj) git_obj.repo.create_tag(tag) + return sha