Skip to content

Commit

Permalink
chore: extend optional suffix matching to prefer tags that most close…
Browse files Browse the repository at this point in the history
…ly match the version parts

Signed-off-by: Ben Selwyn-Smith <[email protected]>
  • Loading branch information
benmss committed Oct 26, 2023
1 parent f1a86af commit f7259eb
Show file tree
Hide file tree
Showing 3 changed files with 95 additions and 47 deletions.
79 changes: 65 additions & 14 deletions src/macaron/repo_finder/commit_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,13 @@
logger: logging.Logger = logging.getLogger(__name__)

ALPHANUMERIC = "0-9a-z"
PREFIX = "(?:.*)"
# Any number of irrelevant prefix characters followed by 1 to 2 prefix-version separators (optional).
PREFIX = "(?:.*[-_/vr]{1,2})?"
# Version token separator: 1 to 3 alphabetic characters OR 1 to 3 non-alphanumeric characters.
INFIX = "([a-z]{1,3}|[^0-9a-z]{1,3})"
split_pattern = re.compile(f"[^{ALPHANUMERIC}]+", flags=re.IGNORECASE)
validation_pattern = re.compile(f"[{ALPHANUMERIC}]+", flags=re.IGNORECASE)
alphabetic_pattern = re.compile(".*[a-z].*", flags=re.IGNORECASE)


def get_commit_from_version(git_obj: Git, purl: PackageURL) -> tuple[str, str]:
Expand All @@ -44,8 +46,15 @@ def get_commit_from_version(git_obj: Git, purl: PackageURL) -> tuple[str, str]:
return "", ""
logger.debug("Searching for commit of artifact version using tags: %s@%s", purl.name, purl.version)

target_version_pattern = _build_version_pattern(purl.version)
has_name_pattern = re.compile(f".*{re.escape(purl.name)}.*[0-9].*", flags=re.IGNORECASE)
target_version_pattern, parts = _build_version_pattern(purl.version)
# The has_name_pattern tries to determine whether a given tag contains the name of the current artifact as a prefix
# and at least one number that could belong to a version.
# .* : Any number of irrelevant pre-prefix characters.
# purl.name : The artifact name (escaped).
# [^a-z0-9]{1,3} : 1 to 3 non-numeric characters.
# [0-9] : A single numeric character.
# .* : The remainder of the version and possible suffixes.
has_name_pattern = re.compile(f".*{re.escape(purl.name)}[^{ALPHANUMERIC}]" + "{1,3}[0-9].*", flags=re.IGNORECASE)

# Tags are examined as followed:
# - Any without a corresponding commit are discarded.
Expand All @@ -70,9 +79,15 @@ def get_commit_from_version(git_obj: Git, purl: PackageURL) -> tuple[str, str]:

# Match tags.
if named_tags:
matched_tags = _match_tags(named_tags, target_version_pattern)
matched_tags = _match_tags(named_tags, target_version_pattern, parts)
if not matched_tags:
# See if there are any possible matches from non-named tags.
possible_matches = _match_tags(other_tags, target_version_pattern, parts)
if possible_matches:
# TODO Determine whether to accept a possible match.
logger.debug("Possible tag matches: %s", possible_matches)
else:
matched_tags = _match_tags(other_tags, target_version_pattern)
matched_tags = _match_tags(other_tags, target_version_pattern, parts)

# Report matched tags.
if not matched_tags:
Expand Down Expand Up @@ -116,7 +131,7 @@ def get_commit_from_version(git_obj: Git, purl: PackageURL) -> tuple[str, str]:
return "", ""


def _build_version_pattern(version: str) -> Pattern:
def _build_version_pattern(version: str) -> tuple[Pattern, list[str]]:
"""Build a version pattern to match the passed version string.
Parameters
Expand All @@ -126,8 +141,8 @@ def _build_version_pattern(version: str) -> Pattern:
Returns
-------
Pattern
The regex pattern that will match the version.
tuple[Pattern, list[str]]
The tuple of the regex pattern that will match the version, and the list of version parts that were extracted.
"""
# The version is split on non-alphanumeric characters to separate the version parts from the non-version parts.
Expand All @@ -139,24 +154,32 @@ def _build_version_pattern(version: str) -> Pattern:
split = [version]

this_version_pattern = ""
for part in split:
parts = []
for count, part in enumerate(split):
# Validate the split part by checking it is only comprised of alphanumeric characters.
valid = validation_pattern.match(part)
if not valid:
continue
parts.append(part)
# If the final version part has at least one alphabetic character in it, consider it to be optional.
# This requires wrapping it and the previous INFIX in brackets.
has_alphabetic_suffix = count == len(split) - 1 and alphabetic_pattern.match(str(part))
if has_alphabetic_suffix:
this_version_pattern = this_version_pattern + "("
if this_version_pattern:
# Between one and three non-numeric characters are accepted between the version parts.
# This balances the tradeoff between maximal matching and minimal false positives.
# INFIX matches separators as described where it is instantiated.
this_version_pattern = this_version_pattern + INFIX
this_version_pattern = this_version_pattern + str(part)
if has_alphabetic_suffix:
this_version_pattern = this_version_pattern + ")?"

# Prepend the optional prefix, add a named capture group for the version, and enforce end of string analysis.
this_version_pattern = PREFIX + "(?P<version>" + this_version_pattern + ")" + "$"
logger.debug("Created pattern: %s", this_version_pattern)
return re.compile(this_version_pattern, flags=re.IGNORECASE)
return re.compile(this_version_pattern, flags=re.IGNORECASE), parts


def _match_tags(tag_list: list[TagReference], pattern: Pattern) -> list[TagReference]:
def _match_tags(tag_list: list[TagReference], pattern: Pattern, parts: list[str]) -> list[TagReference]:
"""Return items of the passed tag list that match the passed pattern.
Parameters
Expand All @@ -165,10 +188,13 @@ def _match_tags(tag_list: list[TagReference], pattern: Pattern) -> list[TagRefer
The list of tags to check.
pattern: Pattern
The pattern to match against.
parts: list[str]
The list of version parts extracted from the version.
Returns
-------
The list of tags that matched the pattern.
list[TagReference]
The list of tags that matched the pattern.
"""
matched_tags = []
for tag in tag_list:
Expand All @@ -177,4 +203,29 @@ def _match_tags(tag_list: list[TagReference], pattern: Pattern) -> list[TagRefer
if not match:
continue
matched_tags.append(tag)
if len(matched_tags) > 1:
# Sort the matched tags so that tags that contain all the version parts are prioritised.
matched_tags.sort(key=lambda item: _count_parts_in_tag(str(item), parts))
return matched_tags


def _count_parts_in_tag(tag: str, parts: list[str]) -> int:
"""Count how many version part strings are contained in the tag.
Parameters
----------
tag: str
The tag to be checked.
parts: list[str]
The version parts extracted from the version string.
Returns
-------
int
The count of contained parts subtracted from the number of parts.
"""
count = len(parts)
for part in parts:
if part in tag:
count = count - 1
return count
45 changes: 20 additions & 25 deletions tests/repo_finder/test_repo_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

"""This module tests the repo finder."""
import os
import shutil
from pathlib import Path

import pytest
Expand All @@ -12,7 +13,7 @@
from macaron.config.target_config import Configuration
from macaron.repo_finder import commit_finder
from macaron.slsa_analyzer.analyzer import Analyzer
from tests.slsa_analyzer.mock_git_utils import add_tag_if_not_present, commit_nothing, initiate_repo
from tests.slsa_analyzer.mock_git_utils import add_new_commit_with_tag, initiate_repo


@pytest.mark.parametrize(
Expand Down Expand Up @@ -79,39 +80,33 @@ def test_resolve_analysis_target(
def test_get_commit_from_version() -> None:
"""Test resolving commits from version tags."""
path = Path(__file__).parent.joinpath("mock_repo")
init_repo = not os.path.exists(path)
if os.path.exists(path):
shutil.rmtree(path)
git_obj: Git = initiate_repo(path)
if init_repo:
tags = [
"test-name-v1.0.1-A",
"v1.0.1-B",
"v1.0.3+test",
"v_1.0.5",
"50_0_2",
"r78rv109",
]
# Add a commit for each tag with a message that can be verified later.
for count, value in enumerate(tags):
commit_nothing(git_obj, str(count))
add_tag_if_not_present(git_obj, value)

tags = ["test-name-v1.0.1-A", "v1.0.3+test", "v_1.0.5", "50_0_2", "r78rv109", "1.0.5-JRE"]
# Add a commit for each tag that can be verified later.
hash_targets = []
for tag in tags:
hash_targets.append(add_new_commit_with_tag(git_obj, tag))

# Perform tests
versions = [
"1.0.1-A",
"1.0.1-B",
"1.0.3+test",
"1.0.5",
"50.0.2",
"78.109",
"1.0.1-A", # To match a tag with a named suffix.
"1.0.3+test", # To match a tag with a '+' suffix.
"1.0.5", # To match a tag with a 'v_' prefix.
"50.0.2", # To match a tag separated by '_'.
"78.109", # To match a tag separated by characters 'r' 'rv'.
"1.0.5-JRE", # To NOT match the similar tag without the 'JRE' suffix.
]
purl_name = "test-name"
for count, value in enumerate(versions):
_test_version(git_obj, PackageURL(type="maven", name=purl_name, version=value), str(count))
_test_version(git_obj, PackageURL(type="maven", name=purl_name, version=value), hash_targets[count])
purl_name = "test-name" + "-" + str(count + 1)


def _test_version(git_obj: Git, purl: PackageURL, commit_message: str) -> None:
"""Retrieve commit matching version and check commit message is correct."""
def _test_version(git_obj: Git, purl: PackageURL, hash_target: str) -> None:
"""Retrieve commit matching version and check commit hash is correct."""
branch, digest = commit_finder.get_commit_from_version(git_obj, purl)
assert branch
assert git_obj.get_commit(digest).msg == commit_message
assert git_obj.get_commit(digest).hash == hash_target
18 changes: 10 additions & 8 deletions tests/slsa_analyzer/mock_git_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def commit_files(git_wrapper: Git, file_names: list) -> bool:
return False


def commit_nothing(git_wrapper: Git, message: str = "") -> bool:
def commit_nothing(git_wrapper: Git, message: str = "") -> str:
"""Create an empty commit in the repository indicated by the git_wrapper.
Parameters
Expand All @@ -79,18 +79,18 @@ def commit_nothing(git_wrapper: Git, message: str = "") -> bool:
Returns
-------
bool
True if succeed else False.
str
The commit sha or an empty string if unsuccessful.
"""
try:
# Store the index object as recommended by the documentation
current_index = git_wrapper.repo.index
if not message:
message = "Empty commit"
current_index.commit(message)
return True
commit = current_index.commit(message)
return str(commit.hexsha)
except GitError:
return False
return ""


def prepare_repo_for_testing(
Expand Down Expand Up @@ -137,7 +137,7 @@ def prepare_repo_for_testing(
return analyze_ctx


def add_tag_if_not_present(git_obj: Git, tag: str) -> None:
def add_new_commit_with_tag(git_obj: Git, tag: str) -> str:
"""Add passed tag to repository if not already present.
Parameters
Expand All @@ -148,5 +148,7 @@ def add_tag_if_not_present(git_obj: Git, tag: str) -> None:
The tag to possibly add.
"""
if tag in git_obj.repo.tags:
return
return ""
sha = commit_nothing(git_obj)
git_obj.repo.create_tag(tag)
return sha

0 comments on commit f7259eb

Please sign in to comment.