Skip to content

Commit

Permalink
Add workaround for applying backport from PR-6064 with django-command
Browse files Browse the repository at this point in the history
  • Loading branch information
pedro-psb committed Jan 8, 2025
1 parent 18cb45e commit 43a4de6
Show file tree
Hide file tree
Showing 9 changed files with 149 additions and 32 deletions.
7 changes: 7 additions & 0 deletions .github/workflows/scripts/script.sh
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,13 @@ cmd_user_prefix bash -c "django-admin makemigrations core --check --dry-run"
cmd_user_prefix bash -c "django-admin makemigrations file --check --dry-run"
cmd_user_prefix bash -c "django-admin makemigrations certguard --check --dry-run"

# See pulpcore.app.util.ENABLE_6064_BACKPORT_WORKAROUND for context.
# This needs to be set here because it relies on service init.
# Its being tested in only one scenario to have both cases covered.
if [[ "$TEST" == "s3" ]]; then
cmd_prefix pulpcore-manager backport-patch-6064
fi

# Run unit tests.
cmd_user_prefix bash -c "PULP_DATABASES__default__USER=postgres pytest -v -r sx --color=yes --suppress-no-test-exit-code -p no:pulpcore --pyargs pulpcore.tests.unit"
cmd_user_prefix bash -c "PULP_DATABASES__default__USER=postgres pytest -v -r sx --color=yes --suppress-no-test-exit-code -p no:pulpcore --pyargs pulp_file.tests.unit"
Expand Down
4 changes: 4 additions & 0 deletions CHANGES/5725.bugfix
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,7 @@ On a request for on-demand content in the content app, a corrupted Remote that
contains the wrong binary (for that content) prevented other Remotes from being
attempted on future requests. Now the last failed Remotes are temporarily ignored
and others may be picked.

Because the [original](https://github.com/pulp/pulpcore/pull/6064) contains a migraton,
this is backported here as an optional patch which can be enabled by running the
pulpcore-manager command: `backport-patch-6064`.
9 changes: 9 additions & 0 deletions pulpcore/app/apps.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,15 @@ def ready(self):
super().ready()
from . import checks # noqa

# Backport workaround for https://github.com/pulp/pulpcore/pull/6064
from pulpcore.app.models import RemoteArtifact
from django.db import models
import pulpcore.app.util

if pulpcore.app.util.failed_at_exists(connection, RemoteArtifact):
pulpcore.app.util.ENABLE_6064_BACKPORT_WORKAROUND = True
RemoteArtifact.add_to_class("failed_at", models.DateTimeField(null=True))

post_migrate.connect(
_ensure_default_domain, sender=self, dispatch_uid="ensure_default_domain"
)
Expand Down
77 changes: 77 additions & 0 deletions pulpcore/app/management/commands/backport-patch-6064.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
from django.core.management.base import BaseCommand
from gettext import gettext as _
from django.db import connection
from pulpcore.app.models import RemoteArtifact


CHECK_COL_QUERY = """
SELECT COUNT(*)
FROM information_schema.columns
WHERE table_name = %s
AND column_name = %s;
"""

MODIFY_QUERY_TMPL = """
ALTER TABLE {}
ADD COLUMN {} TIMESTAMPTZ DEFAULT NULL;
"""

HELP = _(
"""
Enables patch backport of #6064 (https://github.com/pulp/pulpcore/pull/6064).
The fix prevents corrupted remotes from making content unreacahble by adding
a cooldown time, which is tracked by a new field, 'RemoteArtifact.failed_at'.
This command adds the field to the appropriate table.
"""
)


class Command(BaseCommand):
help = HELP

def add_arguments(self, parser):
parser.add_argument(
"--dry-run",
action="store_true",
help="Run the migration in dry-run mode without saving changes",
)

def handle(self, *args, **options):
dry_run = options.get("dry_run", False)
try:
with connection.cursor() as cursor:
# Check if column already exists
table_name = RemoteArtifact._meta.db_table
field_name = "failed_at"
cursor.execute(CHECK_COL_QUERY, [table_name, field_name])
field_exists = cursor.fetchone()[0] > 0
if field_exists:
self._print_success(f"Field '{table_name}.{field_name}' already exists.")
self._print_success("Nothing to be done")
return

# Add field to table
self._print_info(f"Adding {field_name!r} column to {table_name!r}...")
MODIFY_QUERY = MODIFY_QUERY_TMPL.format(table_name, field_name)
if not dry_run:
cursor.execute(MODIFY_QUERY)
self._print_success("Done")
else:
self._print_warn("[DRY-RUN] SQL that would be executed:")
self._print_info(MODIFY_QUERY)
except Exception as e:
self._print_error(f"Migration failed: {str(e)}")
raise

def _print_info(self, msg):
self.stdout.write(msg)

def _print_success(self, msg):
self.stdout.write(self.style.SUCCESS(msg))

def _print_error(self, msg):
self.stdout.write(self.style.ERROR(msg))

def _print_warn(self, msg):
self.stdout.write(self.style.WARNING(msg))
18 changes: 0 additions & 18 deletions pulpcore/app/migrations/0126_remoteartifact_failed_at.py

This file was deleted.

1 change: 0 additions & 1 deletion pulpcore/app/models/content.py
Original file line number Diff line number Diff line change
Expand Up @@ -722,7 +722,6 @@ class RemoteArtifact(BaseModel, QueryMixin):
sha256 = models.CharField(max_length=64, null=True, db_index=True)
sha384 = models.CharField(max_length=96, null=True, db_index=True)
sha512 = models.CharField(max_length=128, null=True, db_index=True)
failed_at = models.DateTimeField(null=True)

content_artifact = models.ForeignKey(ContentArtifact, on_delete=models.CASCADE)
remote = models.ForeignKey("Remote", on_delete=models.CASCADE)
Expand Down
22 changes: 22 additions & 0 deletions pulpcore/app/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,28 @@
from pulpcore.exceptions.validation import InvalidSignatureError


# Backport workaround for https://github.com/pulp/pulpcore/pull/6064
# If 'pulpcore-manager backport-patch-6064' was run, the field
# RemoteArtifact.failed_at will be set and the backport will take effect.
ENABLE_6064_BACKPORT_WORKAROUND = False


def failed_at_exists(connection, ra_class) -> bool:
"""Whtether 'failed_at' exists in the database."""
table_name = ra_class._meta.db_table
field_name = "failed_at"
CHECK_COL_QUERY = """
SELECT COUNT(*)
FROM information_schema.columns
WHERE table_name = %s
AND column_name = %s;
"""
with connection.cursor() as cursor:
cursor.execute(CHECK_COL_QUERY, [table_name, field_name])
field_exists = cursor.fetchone()[0] > 0
return field_exists


# a little cache so viewset_for_model doesn't have to iterate over every app every time
_model_viewset_cache = {}
STRIPPED_API_ROOT = settings.API_ROOT.strip("/")
Expand Down
33 changes: 22 additions & 11 deletions pulpcore/content/handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@
MetricsEmitter,
get_domain,
cache_key,
ENABLE_6064_BACKPORT_WORKAROUND,
)

from pulpcore.exceptions import ( # noqa: E402
Expand Down Expand Up @@ -852,12 +853,15 @@ async def _stream_content_artifact(self, request, response, content_artifact):
ClientConnectionError,
)

protection_time = settings.REMOTE_CONTENT_FETCH_FAILURE_COOLDOWN
remote_artifacts = (
content_artifact.remoteartifact_set.select_related("remote")
.order_by_acs()
.exclude(failed_at__gte=timezone.now() - timedelta(seconds=protection_time))
)
remote_artifacts = content_artifact.remoteartifact_set.select_related(
"remote"
).order_by_acs()

if ENABLE_6064_BACKPORT_WORKAROUND:
protection_time = settings.REMOTE_CONTENT_FETCH_FAILURE_COOLDOWN
remote_artifacts = remote_artifacts.exclude(
failed_at__gte=timezone.now() - timedelta(seconds=protection_time)
)
async for remote_artifact in remote_artifacts:
try:
response = await self._stream_remote_artifact(request, response, remote_artifact)
Expand Down Expand Up @@ -1166,18 +1170,25 @@ async def finalize():
try:
download_result = await downloader.run()
except DigestValidationError:
remote_artifact.failed_at = timezone.now()
await remote_artifact.asave()
COOLDOWN_MSG = ""
if ENABLE_6064_BACKPORT_WORKAROUND:
remote_artifact.failed_at = timezone.now()
await remote_artifact.asave()
REMOTE_CONTENT_FETCH_FAILURE_COOLDOWN = (
settings.REMOTE_CONTENT_FETCH_FAILURE_COOLDOWN
)
COOLDOWN_MSG = (
"- Marking this Remote to be ignored for "
f"{REMOTE_CONTENT_FETCH_FAILURE_COOLDOWN=}s.\n\n"
)
await downloader.session.close()
close_tcp_connection(request.transport._sock)
REMOTE_CONTENT_FETCH_FAILURE_COOLDOWN = settings.REMOTE_CONTENT_FETCH_FAILURE_COOLDOWN
raise RuntimeError(
f"Pulp tried streaming {remote_artifact.url!r} to "
"the client, but it failed checksum validation.\n\n"
"We can't recover from wrong data already sent so we are:\n"
"- Forcing the connection to close.\n"
"- Marking this Remote to be ignored for "
f"{REMOTE_CONTENT_FETCH_FAILURE_COOLDOWN=}s.\n\n"
f"{COOLDOWN_MSG}"
"If the Remote is known to be fixed, try resyncing the associated repository.\n"
"If the Remote is known to be permanently corrupted, try removing "
"affected Pulp Remote, adding a good one and resyncing.\n"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -236,5 +236,11 @@ def download_from_distribution(content, distribution):
download_from_distribution(content_name, distribution)

# WHEN/THEN (second request)
actual_checksum = download_from_distribution(content_name, distribution)
assert actual_checksum == expected_checksum
from pulpcore.app.util import ENABLE_6064_BACKPORT_WORKAROUND

if ENABLE_6064_BACKPORT_WORKAROUND:
actual_checksum = download_from_distribution(content_name, distribution)
assert actual_checksum == expected_checksum
else:
with pytest.raises(ClientPayloadError, match="Response payload is not completed"):
download_from_distribution(content_name, distribution)

0 comments on commit 43a4de6

Please sign in to comment.