Skip to content

Commit

Permalink
commands.extract.extractors:msg - add a prefix with the attachment in…
Browse files Browse the repository at this point in the history
…dex if there are multiples of the same name
  • Loading branch information
MatteoCampinoti94 committed Jan 8, 2025
1 parent 27e6071 commit 0a9e910
Showing 1 changed file with 13 additions and 4 deletions.
17 changes: 13 additions & 4 deletions digiarch/commands/extract/extractors/extractor_msg.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,16 @@ def msg_attachments(
return inline_attachments, attachments


def prepare_attachment_name(names: list[str], name: str, n: int) -> tuple[list[str], str, str]:
    """Build a unique, sanitized file name for an extracted attachment.

    The raw name is stripped and, if empty, replaced with ``attachment-{n}``.
    It is then passed through ``sanitize_filename`` and stripped of leading and
    trailing underscores, with the same ``attachment-{n}`` fallback if nothing
    is left. The lower-cased sanitized name is recorded in ``names``; when the
    same lower-cased name has been recorded before, the sanitized name is
    prefixed with the number of earlier occurrences (``1_``, ``2_``, ...).

    Args:
        names: Lower-cased sanitized names recorded so far. Mutated in place:
            the current name is appended on every call.
        name: Raw attachment name; may be empty or whitespace only.
        n: Index of the attachment, used to build the fallback name.

    Returns:
        A tuple ``(names, name, name_sanitized)``: the same ``names`` list
        object that was passed in, the stripped (or fallback) original name,
        and the sanitized, possibly prefixed, name to use on disk.
    """
    fallback = f"attachment-{n}"
    name = name.strip() or fallback
    # NOTE(review): the meaning of the ``20`` and ``True`` arguments is defined
    # by sanitize_filename, which is not visible here -- confirm before changing.
    name_sanitized: str = sanitize_filename(name, 20, True).strip("_") or fallback

    # Duplicates are detected case-insensitively.
    key = name_sanitized.lower()
    names.append(key)
    if (count := names.count(key)) > 1:
        # NOTE(review): the prefixed name itself is not recorded in ``names``,
        # so an attachment whose own sanitized name is e.g. "1_foo" could still
        # clash with the prefixed duplicate of "foo" -- verify if that matters.
        name_sanitized = f"{count - 1}_{name_sanitized}"
    return names, name, name_sanitized


class MsgExtractor(ExtractorBase):
tool_names: ClassVar[list[str]] = ["msg"]

Expand All @@ -120,11 +130,11 @@ def extract(self) -> list[tuple[Path, Path]]:
inline_attachments, attachments = msg_attachments(msg, body_html, body_rtf)

with TempDir(self.file.root) as tmp_dir:
names: list[str] = []
for n, attachment in enumerate(inline_attachments + attachments):
if isinstance(attachment, (Message, MessageSigned)):
name: str = (attachment.filename or "").strip() or (attachment.subject or "").strip()
name = name.strip() or f"attachment-{n}"
name_sanitized: str = sanitize_filename(name, 20, True).strip("_") or f"attachment-{n}"
names, name, name_sanitized = prepare_attachment_name(names, name, n)
attachment.export(tmp_dir / name_sanitized)
files.append((name_sanitized, name))
elif isinstance(attachment.data, bytes):
Expand All @@ -133,8 +143,7 @@ def extract(self) -> list[tuple[Path, Path]]:
if isinstance(attachment, Attachment)
else attachment.longFilename or ""
)
name = name.strip() or f"attachment-{n}"
name_sanitized: str = sanitize_filename(name, 20, True).strip("_") or f"attachment-{n}"
names, name, name_sanitized = prepare_attachment_name(names, name, n)
with tmp_dir.joinpath(name_sanitized).open("wb") as fh:
fh.write(attachment.data or b"")
files.append((name_sanitized, name))
Expand Down

0 comments on commit 0a9e910

Please sign in to comment.