diff --git a/digiarch/__version__.py b/digiarch/__version__.py index 79e4386b..67fc7e83 100644 --- a/digiarch/__version__.py +++ b/digiarch/__version__.py @@ -1 +1 @@ -__version__ = "3.2.4" +__version__ = "3.2.5" diff --git a/digiarch/doctor.py b/digiarch/doctor.py index aea83b12..eec26183 100644 --- a/digiarch/doctor.py +++ b/digiarch/doctor.py @@ -62,16 +62,16 @@ def sanitize_paths(ctx: Context, database: FileDB, root: Path, dry_run: bool, *l def deduplicate_extensions(ctx: Context, database: FileDB, root: Path, dry_run: bool, *loggers: Logger | None): + database.create_function("__reverse", 1, lambda s: s[::-1]) for file in database.files.select( - where="instr(reverse(relative_path), '.') != 0" + where="instr(__reverse(relative_path), '.') != 0" " and relative_path like '%' ||" - " substr(relative_path, length(relative_path) - instr(reverse(relative_path), '.') + 1) ||" - " substr(relative_path, length(relative_path) - instr(reverse(relative_path), '.') + 1)" + " substr(relative_path, length(relative_path) - instr(__reverse(relative_path), '.') + 1) ||" + " substr(relative_path, length(relative_path) - instr(__reverse(relative_path), '.') + 1)" ): old_suffixes: list[str] = [s.lower() for s in file.suffixes.split(".") if s] - new_suffixes: list[str] = [s.lower() for s in old_suffixes] # Deduplicate suffixes - new_suffixes = sorted(set(new_suffixes), key=new_suffixes.index) + new_suffixes: list[str] = sorted(set(old_suffixes), key=old_suffixes.index) # Restore original letter case new_suffixes = [next(s2 for s2 in old_suffixes if s2.lower() == s) for s in new_suffixes] old_name: str = file.name diff --git a/poetry.lock b/poetry.lock index aca2f155..64e41a55 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2,7 +2,7 @@ [[package]] name = "acacore" -version = "3.0.4" +version = "3.0.5" description = "" optional = false python-versions = "^3.11" @@ -19,8 +19,8 @@ pyyaml = "^6.0.2" [package.source] type = "git" url = "https://github.com/aarhusstadsarkiv/acacore.git" -reference = "v3.0.4" -resolved_reference = "68acebb001166a7e1246c00fae1afeeb62fc6267" +reference = "v3.0.5" +resolved_reference = "b76620779212498201c95ddc25431f860e5d6340" [[package]] name = "annotated-types" @@ -346,13 +346,13 @@ files = [ [[package]] name = "extract-msg" -version = "0.48.7" +version = "0.49.0" description = "Extracts emails and attachments saved in Microsoft Outlook's .msg files" optional = false python-versions = ">=3.8" files = [ - {file = "extract_msg-0.48.7-py3-none-any.whl", hash = "sha256:0477489aa2ac417387803f19fa53ddc44136846a648b0898a114212272a1a111"}, - {file = "extract_msg-0.48.7.tar.gz", hash = "sha256:3ddf015c0e0a6ea36026fedfb7f8e434ca37150a31069363b2d0752196d15b6e"}, + {file = "extract_msg-0.49.0-py3-none-any.whl", hash = "sha256:6a1756164ef2d0c230bce1966d52155da8bd4bec9a6a1c3166cbdff8ffd9e0ba"}, + {file = "extract_msg-0.49.0.tar.gz", hash = "sha256:cc700cdcc0cb6fcdbfa3b9e477201958cc28c584716d5c45fdd47261c1f2dfcc"}, ] [package.dependencies] @@ -886,4 +886,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "a05d96b6e62e404ba998f36aab0d39076f59f0a533c67e9aaa1531e9c01e9fc7" +content-hash = "0446d48c93cffcffcefe8a771b89b421c995faf5704ee4addb36ca2567ea166a" diff --git a/pyproject.toml b/pyproject.toml index 130bbf66..9430dfe4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,8 +1,8 @@ [tool.poetry] name = "digiarch" -version = "3.2.4" +version = "3.2.5" description = "Tools for the Digital Archive Project at Aarhus Stadsarkiv" -authors = ["Aryan Muhammadi Landi ", "Nina Jensen ", "Aarhus Stadsarkiv "] +authors = ["Aarhus Stadsarkiv "] license = "GPL-3.0" readme = "README.md" homepage = "https://stadsarkiv.aarhus.dk/" @@ -12,10 +12,10 @@ include = ["pyproject.toml"] [tool.poetry.dependencies] python = "^3.11" -acacore = {git = "https://github.com/aarhusstadsarkiv/acacore.git", tag = "v3.0.4"} +acacore = {git = "https://github.com/aarhusstadsarkiv/acacore.git", tag = "v3.0.5"} patool = "^2.4.0" tnefparse = "^1.4.0" -extract-msg = "^0.48.7" +extract-msg = "^0.49.0" chardet = "^5.2.0" [tool.poetry.group.dev.dependencies] diff --git a/tests/files/_metadata/files.db b/tests/files/_metadata/files.db index 80eea0e5..b54dc82b 100644 Binary files a/tests/files/_metadata/files.db and b/tests/files/_metadata/files.db differ diff --git a/tests/test_cli.py b/tests/test_cli.py index 2fe3f035..a8dd6502 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -18,6 +18,7 @@ from pydantic import BaseModel from digiarch.cli import app +from digiarch.doctor import command_doctor from digiarch.edit.action import group_action from digiarch.edit.edit import group_edit from digiarch.edit.lock import command_lock @@ -149,6 +150,107 @@ def test_reidentify(tests_folder: Path, files_folder: Path, files_folder_copy: P assert "extension mismatch" not in (file_new.warning or []) +# noinspection DuplicatedCode +def test_extract(tests_folder: Path, files_folder: Path, files_folder_copy: Path): + database_path: Path = files_folder / "_metadata" / "files.db" + database_path_copy: Path = files_folder_copy / database_path.relative_to(files_folder) + database_path_copy.parent.mkdir(parents=True, exist_ok=True) + copy(database_path, database_path_copy) + + with FileDB(database_path_copy) as database: + files: list[File] = list(database.files.select(where="action = 'extract'")) + + app.main( + [ + command_extract.name, + str(files_folder_copy), + "--actions", + str(tests_folder / "fileformats.yml"), + "--custom-signatures", + str(tests_folder / "custom_signatures.yml"), + "--siegfried-home", + str(tests_folder), + ], + standalone_mode=False, + ) + + with FileDB(database_path_copy) as database: + for file in files: + file2 = database.files.select(where="uuid = ?", parameters=[str(file.uuid)]).fetchone() + assert file2 + if file.relative_path.name.split(".", 1)[0].endswith("-encrypted"): + assert file2.action == "ignore" + assert file2.action_data.ignore.template == "password-protected" + assert file2.processed is True + elif on_success_action := file.action_data.extract.on_success: + assert file2.action == on_success_action + else: + assert file2.action == "ignore" + assert file2.action_data.ignore.template == "extracted-archive" + assert file2.processed is True + assert database.files.select(where="parent = ?", parameters=[str(file.uuid)]).fetchone() + + +# noinspection DuplicatedCode +def test_doctor_paths(tests_folder: Path, files_folder: Path, files_folder_copy: Path): + database_path: Path = files_folder / "_metadata" / "files.db" + database_path_copy: Path = files_folder_copy / database_path.relative_to(files_folder) + database_path_copy.parent.mkdir(parents=True, exist_ok=True) + copy(database_path, database_path_copy) + + with FileDB(database_path_copy) as database: + files: list[File] = list(database.files.select()) + for file in files: + file.root = files_folder_copy + file.relative_path = ( + file.get_absolute_path() + .rename(file.get_absolute_path().with_name("*" + file.name)) + .relative_to(files_folder_copy) + ) + database.files.update({"relative_path": file.relative_path}, {"uuid": file.uuid}) + database.commit() + + app.main([command_doctor.name, str(files_folder_copy), "--fix", "paths"], standalone_mode=False) + + with FileDB(database_path_copy) as database: + for file in files: + file2: File | None = database.files.select(where="uuid = ?", parameters=[str(file.uuid)]).fetchone() + assert file2 + assert not file.get_absolute_path(files_folder_copy).is_file() + assert file2.get_absolute_path(files_folder_copy).is_file() + assert file2.relative_path == file.relative_path.with_name(file.name.replace("*", "_")) + + +# noinspection DuplicatedCode +def test_doctor_extensions(tests_folder: Path, files_folder: Path, files_folder_copy: Path): + database_path: Path = files_folder / "_metadata" / "files.db" + database_path_copy: Path = files_folder_copy / database_path.relative_to(files_folder) + database_path_copy.parent.mkdir(parents=True, exist_ok=True) + copy(database_path, database_path_copy) + + with FileDB(database_path_copy) as database: + files: list[File] = [f for f in database.files.select() if f.suffix] + for file in files: + file.root = files_folder_copy + file.relative_path = ( + file.get_absolute_path() + .rename(file.get_absolute_path().with_name(file.name + file.suffix)) + .relative_to(files_folder_copy) + ) + database.files.update({"relative_path": file.relative_path}, {"uuid": file.uuid}) + database.commit() + + app.main([command_doctor.name, str(files_folder_copy), "--fix", "extensions"], standalone_mode=False) + + with FileDB(database_path_copy) as database: + for file in files: + file2: File | None = database.files.select(where="uuid = ?", parameters=[str(file.uuid)]).fetchone() + assert file2 + assert not file.get_absolute_path(files_folder_copy).is_file() + assert file2.get_absolute_path(files_folder_copy).is_file() + assert file2.relative_path == file.relative_path.with_name(file.name.removesuffix(file.suffix)) + + # noinspection DuplicatedCode def test_history(tests_folder: Path, files_folder: Path): app.main( @@ -785,44 +887,3 @@ def test_rollback_extract(tests_folder: Path, files_folder: Path, files_folder_c file3: File | None = database.files.select(where="uuid = ?", parameters=[str(file.uuid)]).fetchone() assert not file3 assert not file3.get_absolute_path(files_folder_copy).is_file() - - -# noinspection DuplicatedCode -def test_extract(tests_folder: Path, files_folder: Path, files_folder_copy: Path): - database_path: Path = files_folder / "_metadata" / "files.db" - database_path_copy: Path = files_folder_copy / database_path.relative_to(files_folder) - database_path_copy.parent.mkdir(parents=True, exist_ok=True) - copy(database_path, database_path_copy) - - with FileDB(database_path_copy) as database: - files: list[File] = list(database.files.select(where="action = 'extract'")) - - app.main( - [ - command_extract.name, - str(files_folder_copy), - "--actions", - str(tests_folder / "fileformats.yml"), - "--custom-signatures", - str(tests_folder / "custom_signatures.yml"), - "--siegfried-home", - str(tests_folder), - ], - standalone_mode=False, - ) - - with FileDB(database_path_copy) as database: - for file in files: - file2 = database.files.select(where="uuid = ?", parameters=[str(file.uuid)]).fetchone() - assert file2 - if file.relative_path.name.split(".", 1)[0].endswith("-encrypted"): - assert file2.action == "ignore" - assert file2.action_data.ignore.template == "password-protected" - assert file2.processed is True - elif on_success_action := file.action_data.extract.on_success: - assert file2.action == on_success_action - else: - assert file2.action == "ignore" - assert file2.action_data.ignore.template == "extracted-archive" - assert file2.processed is True - assert database.files.select(where="parent = ?", parameters=[str(file.uuid)]).fetchone()