Skip to content

Commit

Permalink
add + refactor: option to remove local packages that would _not_ fit …
Browse files Browse the repository at this point in the history
…includelist/excludelist
  • Loading branch information
YYYasin19 committed Aug 23, 2023
1 parent c3c5467 commit b2af339
Show file tree
Hide file tree
Showing 2 changed files with 77 additions and 39 deletions.
107 changes: 72 additions & 35 deletions quetz/tasks/mirror.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,15 @@
from quetz.condainfo import CondaInfo, get_subdir_compat
from quetz.config import Config
from quetz.dao import Dao
from quetz.db_models import PackageVersion
from quetz.db_models import Channel, PackageVersion
from quetz.errors import DBError
from quetz.pkgstores import PackageStore
from quetz.tasks import indexing
from quetz.utils import (
MembershipAction,
TicToc,
add_static_file,
check_package_membership,
MembershipAction,
)

# copy common subdirs from conda:
Expand Down Expand Up @@ -421,23 +421,13 @@ def handle_batch(update_batch):

return False

# TODO: also remove all packages that are not in the remote repository anymore
# practically re-write the complete sync mechanism?

# SYNC: Remote -> Local
# for each package in the remote repository:
# validate if it should be downloaded to this channel
# also: remove packages if they are not supposed to in this channel anymore
# go through all packages from remote channel
for repo_package_name, metadata in packages.items():
# find action to do with package
action = check_package_membership(
channel, repo_package_name, metadata, remote_host=remote_repository.host
)
if action == MembershipAction.INCLUDE:
path = os.path.join(arch, repo_package_name)

# try to find out whether it's a new package version

is_uptodate = None
for _check in version_checks:
is_uptodate = _check(repo_package_name, metadata)
Expand All @@ -447,20 +437,23 @@ def handle_batch(update_batch):
# if package is up-to-date skip uploading file
if is_uptodate:
continue
else:
logger.debug(f"updating package {repo_package_name} from {arch}")

logger.debug(f"updating package {repo_package_name} from {arch}")

path = os.path.join(arch, repo_package_name)
update_batch.append((path, repo_package_name, metadata))
update_size += metadata.get('size', 100_000)
elif action == MembershipAction.NOTHING:
logger.debug(
f"package {repo_package_name} not needed by {remote_repository.host} but other channels."
f"package {repo_package_name} not needed by "
f"{remote_repository.host} but other channels"
)
else:
logger.debug(
f"package {repo_package_name} not needed by {remote_repository.host} and no other channels."
f"package {repo_package_name} not needed by "
f"{remote_repository.host} and no other channels."
)
# TODO: only add to remove if exists.
# TODO: only add to remove if exists in this (mirror) channel.
remove_batch.append((arch, repo_package_name))

# perform either downloads or removals
Expand All @@ -473,30 +466,74 @@ def handle_batch(update_batch):
# handle final batch
any_updated |= handle_batch(update_batch)

# SYNC: Local checks Remote
# Validate if all packages in this channel are still
# also present in the remote channel
# if not: add them to the remove batch as well
# TODO

# remove packages marked for removal
if remove_batch:
logger.debug(f"Removing {len(remove_batch)} packages: {remove_batch}")
package_specs_remove = set([p[1].split("-")[0] for p in remove_batch])
for package_specs in package_specs_remove:
dao.remove_package(channel_name, package_name=package_specs)
# TODO: is this needed every time?
dao.cleanup_channel_db(channel_name, package_name=package_specs)
# only remove if exists
if pkgstore.file_exists(channel.name, package_specs):
pkgstore.delete_file(channel.name, destination=package_specs)

any_updated |= True
any_updated |= remove_packages(remove_batch, channel, dao, pkgstore)

if any_updated:
# build local repodata
indexing.update_indexes(dao, pkgstore, channel_name, subdirs=[arch])


def remove_packages(remove_batch, channel: Channel, dao, pkgstore) -> bool:
    """Remove the given packages from the channel database and package store.

    Parameters
    ----------
    remove_batch:
        List of ``(arch, filename)`` tuples; the package *name* is derived
        from the filename (the part before the first ``-``).
    channel:
        Channel the packages belong to.
    dao:
        Database access object used to delete the package rows.
    pkgstore:
        Package store holding the package files.

    Returns
    -------
    bool
        True if anything was removed, so the caller knows the channel
        indexes (repodata) must be rebuilt.
    """
    logger.debug(f"Removing {len(remove_batch)} packages: {remove_batch}")
    removal_performed = False
    # one package name may appear with several versions/builds in the batch,
    # so deduplicate on the name part of the filename
    package_names = {filename.split("-")[0] for _arch, filename in remove_batch}
    for package_name in package_names:
        dao.remove_package(channel.name, package_name=package_name)
        # TODO: is this needed every time?
        dao.cleanup_channel_db(channel.name, package_name=package_name)
        # a DB row was removed, so repodata must be rebuilt even when no
        # file exists in the store (previously the flag was only set on
        # file deletion, leaving stale indexes after DB-only removals)
        removal_performed = True
        # only delete the file if it actually exists in the store
        if pkgstore.file_exists(channel.name, package_name):
            pkgstore.delete_file(channel.name, destination=package_name)

    return removal_performed


def remove_local_packages(
    channel_name: str,
    dao: Dao,
    pkgstore: PackageStore,
    auth: authorization.Rules,
    includelist: List[str | dict] = None,
    excludelist: List[str | dict] = None,
):
    """Remove local packages that no longer fit the channel's lists.

    For each package in the channel, check that it matches the includelist
    (when one is configured) and does not match the excludelist; otherwise
    remove it from the channel DB and the package store.  We assume the two
    lists are well-formed, e.g. they don't overlap.

    NOTE(review): ``auth`` is currently unused — kept for interface
    compatibility with the other task entry points; confirm whether an
    authorization check should be performed here.
    """
    # local import to avoid a circular import at module load time
    from quetz.utils import _all_matching_hosts

    channel = dao.get_channel(channel_name)
    if not channel:
        logger.error(f"channel {channel_name} not found")
        return

    if not includelist:
        includelist = []
    if not excludelist:
        excludelist = []

    packages_to_remove = []

    for package in channel.packages:
        info = package.current_package_version
        package_spec = (info.package_name, info.version, info.build_string)

        # check: does this package match _any_ of the includelist patterns?
        # an empty includelist means "include everything" — without this
        # guard every package would be scheduled for removal
        if includelist and not _all_matching_hosts(includelist, package_spec):
            packages_to_remove.append((None, info.package_name))
            # already scheduled; don't add it a second time below
            continue

        # check: does this package match _any_ of the excludelist patterns?
        if _all_matching_hosts(excludelist, package_spec):
            packages_to_remove.append((None, info.package_name))

    if packages_to_remove:
        remove_packages(packages_to_remove, channel, dao, pkgstore)


def create_packages_from_channeldata(
channel_name: str, user_id: bytes, channeldata: dict, dao: Dao
):
Expand Down
9 changes: 5 additions & 4 deletions quetz/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@
import time
import traceback
import uuid
from enum import Enum
from datetime import datetime, timezone
from enum import Enum
from functools import wraps
from pathlib import Path
from typing import Any, Callable
Expand Down Expand Up @@ -78,8 +78,10 @@ def _all_matching_hosts(
include_or_exclude_list: dict, package_spec: tuple[str, str, str]
) -> list[str]:
"""
Return the names of all matching hosts from the includelist that whould allow _this_ package spec.
include_or_exclude_list: e.g. { "remote1": ["numpy", "pandas"], "remote2": ["r-base"]}
Return the names of all matching hosts from the includelist
that whould allow _this_ package spec.
include_or_exclude_list:
e.g. { "remote1": ["numpy", "pandas"], "remote2": ["r-base"]}
"""
name, version, build = package_spec
matching_hosts = []
Expand Down Expand Up @@ -116,7 +118,6 @@ def check_package_membership(
incl_act = MembershipAction.NOTHING
exclude_now = False
if (includelist := metadata['includelist']) is not None:
incl_act = False # default to False if includelist is defined
# Example: { "main": ["numpy", "pandas"], "r": ["r-base"]}
if isinstance(includelist, dict):
matches = _all_matching_hosts(includelist, package_spec)
Expand Down

0 comments on commit b2af339

Please sign in to comment.