From d7edf56f25d3827be5322949fbff1c243d11d174 Mon Sep 17 00:00:00 2001 From: Matteo Campinoti Date: Thu, 22 Aug 2024 09:26:29 +0200 Subject: [PATCH 01/10] tests - sort functions --- tests/test_cli.py | 82 +++++++++++++++++++++++------------------------ 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index 2fe3f035..a941b030 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -149,6 +149,47 @@ def test_reidentify(tests_folder: Path, files_folder: Path, files_folder_copy: P assert "extension mismatch" not in (file_new.warning or []) +# noinspection DuplicatedCode +def test_extract(tests_folder: Path, files_folder: Path, files_folder_copy: Path): + database_path: Path = files_folder / "_metadata" / "files.db" + database_path_copy: Path = files_folder_copy / database_path.relative_to(files_folder) + database_path_copy.parent.mkdir(parents=True, exist_ok=True) + copy(database_path, database_path_copy) + + with FileDB(database_path_copy) as database: + files: list[File] = list(database.files.select(where="action = 'extract'")) + + app.main( + [ + command_extract.name, + str(files_folder_copy), + "--actions", + str(tests_folder / "fileformats.yml"), + "--custom-signatures", + str(tests_folder / "custom_signatures.yml"), + "--siegfried-home", + str(tests_folder), + ], + standalone_mode=False, + ) + + with FileDB(database_path_copy) as database: + for file in files: + file2 = database.files.select(where="uuid = ?", parameters=[str(file.uuid)]).fetchone() + assert file2 + if file.relative_path.name.split(".", 1)[0].endswith("-encrypted"): + assert file2.action == "ignore" + assert file2.action_data.ignore.template == "password-protected" + assert file2.processed is True + elif on_success_action := file.action_data.extract.on_success: + assert file2.action == on_success_action + else: + assert file2.action == "ignore" + assert file2.action_data.ignore.template == "extracted-archive" + assert file2.processed is True + 
assert database.files.select(where="parent = ?", parameters=[str(file.uuid)]).fetchone() + + # noinspection DuplicatedCode def test_history(tests_folder: Path, files_folder: Path): app.main( @@ -785,44 +826,3 @@ def test_rollback_extract(tests_folder: Path, files_folder: Path, files_folder_c file3: File | None = database.files.select(where="uuid = ?", parameters=[str(file.uuid)]).fetchone() assert not file3 assert not file3.get_absolute_path(files_folder_copy).is_file() - - -# noinspection DuplicatedCode -def test_extract(tests_folder: Path, files_folder: Path, files_folder_copy: Path): - database_path: Path = files_folder / "_metadata" / "files.db" - database_path_copy: Path = files_folder_copy / database_path.relative_to(files_folder) - database_path_copy.parent.mkdir(parents=True, exist_ok=True) - copy(database_path, database_path_copy) - - with FileDB(database_path_copy) as database: - files: list[File] = list(database.files.select(where="action = 'extract'")) - - app.main( - [ - command_extract.name, - str(files_folder_copy), - "--actions", - str(tests_folder / "fileformats.yml"), - "--custom-signatures", - str(tests_folder / "custom_signatures.yml"), - "--siegfried-home", - str(tests_folder), - ], - standalone_mode=False, - ) - - with FileDB(database_path_copy) as database: - for file in files: - file2 = database.files.select(where="uuid = ?", parameters=[str(file.uuid)]).fetchone() - assert file2 - if file.relative_path.name.split(".", 1)[0].endswith("-encrypted"): - assert file2.action == "ignore" - assert file2.action_data.ignore.template == "password-protected" - assert file2.processed is True - elif on_success_action := file.action_data.extract.on_success: - assert file2.action == on_success_action - else: - assert file2.action == "ignore" - assert file2.action_data.ignore.template == "extracted-archive" - assert file2.processed is True - assert database.files.select(where="parent = ?", parameters=[str(file.uuid)]).fetchone() From 
2a94efceb3ca714e099517955bf57671b22b6822 Mon Sep 17 00:00:00 2001 From: Matteo Campinoti Date: Thu, 22 Aug 2024 09:49:33 +0200 Subject: [PATCH 02/10] doctor:deduplicate_extensions - add custom function to reverse strings in case reverse is not available --- digiarch/doctor.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/digiarch/doctor.py b/digiarch/doctor.py index aea83b12..ba6e5c40 100644 --- a/digiarch/doctor.py +++ b/digiarch/doctor.py @@ -62,11 +62,12 @@ def sanitize_paths(ctx: Context, database: FileDB, root: Path, dry_run: bool, *l def deduplicate_extensions(ctx: Context, database: FileDB, root: Path, dry_run: bool, *loggers: Logger | None): + database.create_function("__reverse", 1, lambda s: s[::-1]) for file in database.files.select( - where="instr(reverse(relative_path), '.') != 0" + where="instr(__reverse(relative_path), '.') != 0" " and relative_path like '%' ||" - " substr(relative_path, length(relative_path) - instr(reverse(relative_path), '.') + 1) ||" - " substr(relative_path, length(relative_path) - instr(reverse(relative_path), '.') + 1)" + " substr(relative_path, length(relative_path) - instr(__reverse(relative_path), '.') + 1) ||" + " substr(relative_path, length(relative_path) - instr(__reverse(relative_path), '.') + 1)" ): old_suffixes: list[str] = [s.lower() for s in file.suffixes.split(".") if s] new_suffixes: list[str] = [s.lower() for s in old_suffixes] From 16f388f5839bb58f9fd7ceeff8d1016fdbdb1253 Mon Sep 17 00:00:00 2001 From: Matteo Campinoti Date: Thu, 22 Aug 2024 09:57:27 +0200 Subject: [PATCH 03/10] doctor:deduplicate_extensions - remove unnecessary middle step when deduplicating --- digiarch/doctor.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/digiarch/doctor.py b/digiarch/doctor.py index ba6e5c40..e3935d17 100644 --- a/digiarch/doctor.py +++ b/digiarch/doctor.py @@ -70,9 +70,8 @@ def deduplicate_extensions(ctx: Context, database: FileDB, root: Path, dry_run: " 
substr(relative_path, length(relative_path) - instr(__reverse(relative_path), '.') + 1)" ): old_suffixes: list[str] = [s.lower() for s in file.suffixes.split(".") if s] - new_suffixes: list[str] = [s.lower() for s in old_suffixes] # Deduplicate suffixes - new_suffixes = sorted(set(new_suffixes), key=new_suffixes.index) + new_suffixes: list[str] = [s for s in sorted(set(old_suffixes), key=old_suffixes.index)] # Restore original letter case new_suffixes = [next(s2 for s2 in old_suffixes if s2.lower() == s) for s in new_suffixes] old_name: str = file.name From 5beded9333b997fea6d9dad15114c5488a79d8e6 Mon Sep 17 00:00:00 2001 From: Matteo Campinoti Date: Thu, 22 Aug 2024 09:58:45 +0200 Subject: [PATCH 04/10] poetry - use acacore 3.0.5 --- poetry.lock | 8 ++++---- pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index aca2f155..eac7dbd2 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2,7 +2,7 @@ [[package]] name = "acacore" -version = "3.0.4" +version = "3.0.5" description = "" optional = false python-versions = "^3.11" @@ -19,8 +19,8 @@ pyyaml = "^6.0.2" [package.source] type = "git" url = "https://github.com/aarhusstadsarkiv/acacore.git" -reference = "v3.0.4" -resolved_reference = "68acebb001166a7e1246c00fae1afeeb62fc6267" +reference = "v3.0.5" +resolved_reference = "b76620779212498201c95ddc25431f860e5d6340" [[package]] name = "annotated-types" @@ -886,4 +886,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "a05d96b6e62e404ba998f36aab0d39076f59f0a533c67e9aaa1531e9c01e9fc7" +content-hash = "bd8bf0d02196fea5e636395e3a5f48914283578e01fe728dba41e6a5b59ad2a1" diff --git a/pyproject.toml b/pyproject.toml index 130bbf66..102fae7d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,7 @@ include = ["pyproject.toml"] [tool.poetry.dependencies] python = "^3.11" -acacore = {git = "https://github.com/aarhusstadsarkiv/acacore.git", tag = "v3.0.4"} +acacore = {git = 
"https://github.com/aarhusstadsarkiv/acacore.git", tag = "v3.0.5"} patool = "^2.4.0" tnefparse = "^1.4.0" extract-msg = "^0.48.7" From 8c107a2e290dac4e37638d28753d2d22d11abc46 Mon Sep 17 00:00:00 2001 From: Matteo Campinoti Date: Thu, 22 Aug 2024 10:00:20 +0200 Subject: [PATCH 05/10] poetry - use extract-msg ^0.49.0 --- poetry.lock | 8 ++++---- pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index eac7dbd2..64e41a55 100644 --- a/poetry.lock +++ b/poetry.lock @@ -346,13 +346,13 @@ files = [ [[package]] name = "extract-msg" -version = "0.48.7" +version = "0.49.0" description = "Extracts emails and attachments saved in Microsoft Outlook's .msg files" optional = false python-versions = ">=3.8" files = [ - {file = "extract_msg-0.48.7-py3-none-any.whl", hash = "sha256:0477489aa2ac417387803f19fa53ddc44136846a648b0898a114212272a1a111"}, - {file = "extract_msg-0.48.7.tar.gz", hash = "sha256:3ddf015c0e0a6ea36026fedfb7f8e434ca37150a31069363b2d0752196d15b6e"}, + {file = "extract_msg-0.49.0-py3-none-any.whl", hash = "sha256:6a1756164ef2d0c230bce1966d52155da8bd4bec9a6a1c3166cbdff8ffd9e0ba"}, + {file = "extract_msg-0.49.0.tar.gz", hash = "sha256:cc700cdcc0cb6fcdbfa3b9e477201958cc28c584716d5c45fdd47261c1f2dfcc"}, ] [package.dependencies] @@ -886,4 +886,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "bd8bf0d02196fea5e636395e3a5f48914283578e01fe728dba41e6a5b59ad2a1" +content-hash = "0446d48c93cffcffcefe8a771b89b421c995faf5704ee4addb36ca2567ea166a" diff --git a/pyproject.toml b/pyproject.toml index 102fae7d..0be6d7dc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,7 +15,7 @@ python = "^3.11" acacore = {git = "https://github.com/aarhusstadsarkiv/acacore.git", tag = "v3.0.5"} patool = "^2.4.0" tnefparse = "^1.4.0" -extract-msg = "^0.48.7" +extract-msg = "^0.49.0" chardet = "^5.2.0" [tool.poetry.group.dev.dependencies] From 8fb6e4e026c9ecfa9870e42b52aac820cee254df Mon Sep 17 
00:00:00 2001 From: Matteo Campinoti Date: Thu, 22 Aug 2024 10:01:14 +0200 Subject: [PATCH 06/10] tests.files:files.db - update test reference database --- tests/files/_metadata/files.db | Bin 40960 -> 40960 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/tests/files/_metadata/files.db b/tests/files/_metadata/files.db index 80eea0e57620a42237fe59656e4ff37acd6f6abd..b54dc82bfa96839e5accfe6a6662d19327744b47 100644 GIT binary patch delta 1288 zcma)*J&09D6vy}NCTrI9y{F>FkBGi37a}2NneUmi0eO-xsM{z6g2kDcGiE`bC<-Rc zS_IQB+}0{ph(#oi6he@#lR_Z`Qm0K2+jN;55NwRMo%zi<|1J))@>GGhDRKpJJBh`Tp>?71_2k!I{YJW%e z@oA+4`?&N@2X-#)yh)~g9ofgF<^Ov3aOv$1eCg57|B3WgNA{6usRR2+wD`f&<&{?j zI`%q)ew;2(mLDt(e;YoiqU;Xahu05%sxC>fDa58x8yXR9$lF)9=~nt;V>|Qg_ICMl zlV|zs4+r}e&VBsN%>^Zt0TDpGa!?A37~198nKiTi(PX{;kaXh~uWwG)^@iB6^G{|E a2M73Wx^^dbtDj$)|8?WYWMIGDJ@_vZcvpr1 delta 1258 zcma)*J7`ov6o#{jCa!UJlVIX2=o&9*;R*A8M3A*NrclAgVxCrsAQob?sbU+LRuHVj z6pD+5AR)CDrV*^IMeM}Ra|_W%z1_fk_n$NW{bqA-d~AJO)O=GNVBx>u@Crc<-b!TzlLtyH^JVj((_2PRd7SASg>g z2riLh8GLNq5JXvXuS^C5WzmlY+{ z2B|fuOausNEO){f zIJtEgq>{l%izPYcKnY2>=7}hu%Cl>OvUO^sVjeG%lmeHt0~{M`71)GR83UEV=~DTM z@U6xBm6X(AbOO$d!k?8Xj)2{!z_p258kcqXw)nadDJJiT1nO)AtUrN|g5f7NIzlBL zn##kqL1~|UU-@8@RU}{q5sOvKf+2weQ86ZMWMVFty|ux}oY|i_+5OzR-F?%0(c9`x zcfRyQ_i}f>_o=(xy)&Zycb(ydiJPq^Qgv#uiBv~wu!&I9H8^Y{)ZvqyshCM52irY$DP4`H7qDKLtAeH-lacCi{~+ zW1Y{P9TeqBr#xPtouxM@T_JXb>18AEY1*>8zHshTYijJurAODt7-0g4!6){cL*E5q p<@@@Dd9rk|zvO}ockiX8mHrZ6Cd* Date: Thu, 22 Aug 2024 10:02:10 +0200 Subject: [PATCH 07/10] tests:doctor_paths,doctor_extensions - add tests for doctor command --- tests/test_cli.py | 61 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/tests/test_cli.py b/tests/test_cli.py index a941b030..a8dd6502 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -18,6 +18,7 @@ from pydantic import BaseModel from digiarch.cli import app +from digiarch.doctor import command_doctor from digiarch.edit.action import group_action from 
digiarch.edit.edit import group_edit from digiarch.edit.lock import command_lock @@ -190,6 +191,66 @@ def test_extract(tests_folder: Path, files_folder: Path, files_folder_copy: Path assert database.files.select(where="parent = ?", parameters=[str(file.uuid)]).fetchone() +# noinspection DuplicatedCode +def test_doctor_paths(tests_folder: Path, files_folder: Path, files_folder_copy: Path): + database_path: Path = files_folder / "_metadata" / "files.db" + database_path_copy: Path = files_folder_copy / database_path.relative_to(files_folder) + database_path_copy.parent.mkdir(parents=True, exist_ok=True) + copy(database_path, database_path_copy) + + with FileDB(database_path_copy) as database: + files: list[File] = list(database.files.select()) + for file in files: + file.root = files_folder_copy + file.relative_path = ( + file.get_absolute_path() + .rename(file.get_absolute_path().with_name("*" + file.name)) + .relative_to(files_folder_copy) + ) + database.files.update({"relative_path": file.relative_path}, {"uuid": file.uuid}) + database.commit() + + app.main([command_doctor.name, str(files_folder_copy), "--fix", "paths"], standalone_mode=False) + + with FileDB(database_path_copy) as database: + for file in files: + file2: File | None = database.files.select(where="uuid = ?", parameters=[str(file.uuid)]).fetchone() + assert file2 + assert not file.get_absolute_path(files_folder_copy).is_file() + assert file2.get_absolute_path(files_folder_copy).is_file() + assert file2.relative_path == file.relative_path.with_name(file.name.replace("*", "_")) + + +# noinspection DuplicatedCode +def test_doctor_extensions(tests_folder: Path, files_folder: Path, files_folder_copy: Path): + database_path: Path = files_folder / "_metadata" / "files.db" + database_path_copy: Path = files_folder_copy / database_path.relative_to(files_folder) + database_path_copy.parent.mkdir(parents=True, exist_ok=True) + copy(database_path, database_path_copy) + + with FileDB(database_path_copy) as 
database: + files: list[File] = [f for f in database.files.select() if f.suffix] + for file in files: + file.root = files_folder_copy + file.relative_path = ( + file.get_absolute_path() + .rename(file.get_absolute_path().with_name(file.name + file.suffix)) + .relative_to(files_folder_copy) + ) + database.files.update({"relative_path": file.relative_path}, {"uuid": file.uuid}) + database.commit() + + app.main([command_doctor.name, str(files_folder_copy), "--fix", "extensions"], standalone_mode=False) + + with FileDB(database_path_copy) as database: + for file in files: + file2: File | None = database.files.select(where="uuid = ?", parameters=[str(file.uuid)]).fetchone() + assert file2 + assert not file.get_absolute_path(files_folder_copy).is_file() + assert file2.get_absolute_path(files_folder_copy).is_file() + assert file2.relative_path == file.relative_path.with_name(file.name.removesuffix(file.suffix)) + + # noinspection DuplicatedCode def test_history(tests_folder: Path, files_folder: Path): app.main( From 9b7e423298c841a83c9bf342e01a5e2ca7214bbc Mon Sep 17 00:00:00 2001 From: Matteo Campinoti Date: Thu, 22 Aug 2024 10:03:22 +0200 Subject: [PATCH 08/10] doctor:deduplicate_extensions - remove unnecessary list comprehension --- digiarch/doctor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/digiarch/doctor.py b/digiarch/doctor.py index e3935d17..eec26183 100644 --- a/digiarch/doctor.py +++ b/digiarch/doctor.py @@ -71,7 +71,7 @@ def deduplicate_extensions(ctx: Context, database: FileDB, root: Path, dry_run: ): old_suffixes: list[str] = [s.lower() for s in file.suffixes.split(".") if s] # Deduplicate suffixes - new_suffixes: list[str] = [s for s in sorted(set(old_suffixes), key=old_suffixes.index)] + new_suffixes: list[str] = sorted(set(old_suffixes), key=old_suffixes.index) # Restore original letter case new_suffixes = [next(s2 for s2 in old_suffixes if s2.lower() == s) for s in new_suffixes] old_name: str = file.name From 
0b41b24b1e0d4338792214b17b4756b8135715bb Mon Sep 17 00:00:00 2001 From: Matteo Campinoti Date: Thu, 22 Aug 2024 10:04:51 +0200 Subject: [PATCH 09/10] version - patch 3.2.4 > 3.2.5 --- digiarch/__version__.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/digiarch/__version__.py b/digiarch/__version__.py index 79e4386b..67fc7e83 100644 --- a/digiarch/__version__.py +++ b/digiarch/__version__.py @@ -1 +1 @@ -__version__ = "3.2.4" +__version__ = "3.2.5" diff --git a/pyproject.toml b/pyproject.toml index 0be6d7dc..3682755c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "digiarch" -version = "3.2.4" +version = "3.2.5" description = "Tools for the Digital Archive Project at Aarhus Stadsarkiv" authors = ["Aryan Muhammadi Landi ", "Nina Jensen ", "Aarhus Stadsarkiv "] license = "GPL-3.0" From bf162fa2e9be9e56d36a2ad4391aea145c304d28 Mon Sep 17 00:00:00 2001 From: Matteo Campinoti Date: Thu, 22 Aug 2024 10:06:55 +0200 Subject: [PATCH 10/10] pyproject - use only stadsarkiv@aarhus.dk as author --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 3682755c..9430dfe4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ name = "digiarch" version = "3.2.5" description = "Tools for the Digital Archive Project at Aarhus Stadsarkiv" -authors = ["Aryan Muhammadi Landi ", "Nina Jensen ", "Aarhus Stadsarkiv "] +authors = ["Aarhus Stadsarkiv <stadsarkiv@aarhus.dk>"] license = "GPL-3.0" readme = "README.md" homepage = "https://stadsarkiv.aarhus.dk/"