From 0a9e91089487cf081eebae5273fa39e4ec22409c Mon Sep 17 00:00:00 2001
From: Matteo Campinoti
Date: Wed, 8 Jan 2025 14:02:00 +0100
Subject: [PATCH] commands.extract.extractors:msg - add a prefix with the
 attachment index if there are multiples of the same name

Sanitized attachment names are tracked case-insensitively while a message
is extracted. The first attachment with a given sanitized name keeps it;
every later one gets a "<k>_" prefix, where k is the number of earlier
attachments that produced the same sanitized name (1_, 2_, ...).

---
 .../extract/extractors/extractor_msg.py       | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/digiarch/commands/extract/extractors/extractor_msg.py b/digiarch/commands/extract/extractors/extractor_msg.py
index c74167de..a346bea2 100644
--- a/digiarch/commands/extract/extractors/extractor_msg.py
+++ b/digiarch/commands/extract/extractors/extractor_msg.py
@@ -108,6 +108,16 @@ def msg_attachments(
     return inline_attachments, attachments
 
 
+def prepare_attachment_name(names: list[str], name: str, n: int) -> tuple[list[str], str, str]:
+    """Deduplicate attachment name by attaching a prefix to the sanitized name with the index of that name if it has already been extracted."""
+    name = name.strip() or f"attachment-{n}"
+    name_sanitized: str = sanitize_filename(name, 20, True).strip("_") or f"attachment-{n}"
+    names.append(name_sanitized.lower())
+    if (count := names.count(name_sanitized.lower())) > 1:
+        name_sanitized = f"{count - 1}_{name_sanitized}"
+    return names, name, name_sanitized
+
+
 class MsgExtractor(ExtractorBase):
     tool_names: ClassVar[list[str]] = ["msg"]
 
@@ -120,11 +130,11 @@ def extract(self) -> list[tuple[Path, Path]]:
         inline_attachments, attachments = msg_attachments(msg, body_html, body_rtf)
 
         with TempDir(self.file.root) as tmp_dir:
+            names: list[str] = []
             for n, attachment in enumerate(inline_attachments + attachments):
                 if isinstance(attachment, (Message, MessageSigned)):
                     name: str = (attachment.filename or "").strip() or (attachment.subject or "").strip()
-                    name = name.strip() or f"attachment-{n}"
-                    name_sanitized: str = sanitize_filename(name, 20, True).strip("_") or f"attachment-{n}"
+                    names, name, name_sanitized = prepare_attachment_name(names, name, n)
                     attachment.export(tmp_dir / name_sanitized)
                     files.append((name_sanitized, name))
                 elif isinstance(attachment.data, bytes):
@@ -133,8 +143,7 @@ def extract(self) -> list[tuple[Path, Path]]:
                         if isinstance(attachment, Attachment)
                         else attachment.longFilename or ""
                     )
-                    name = name.strip() or f"attachment-{n}"
-                    name_sanitized: str = sanitize_filename(name, 20, True).strip("_") or f"attachment-{n}"
+                    names, name, name_sanitized = prepare_attachment_name(names, name, n)
                     with tmp_dir.joinpath(name_sanitized).open("wb") as fh:
                         fh.write(attachment.data or b"")
                     files.append((name_sanitized, name))