diff --git a/README b/README index af05d07..c968f3a 100644 --- a/README +++ b/README @@ -7,6 +7,8 @@ Overview mail-parser is a wrapper for `email`_ Python Standard Library. It’s the key module of `SpamScope`_. +From version 1.0.0rc1 mail-parser supports Python 3. + Description ----------- diff --git a/README.md b/README.md index 9c54e18..fba8f96 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,8 @@ mail-parser is a wrapper for [email](https://docs.python.org/2/library/email.message.html) Python Standard Library. It's the key module of [SpamScope](https://github.com/SpamScope/spamscope). +From version 1.0.0rc1 mail-parser supports Python 3. + ## Description mail-parser takes as input a raw mail and generates a parsed object. This object is a tokenized mail with the all parts of mail and some indicator: diff --git a/mailparser/__init__.py b/mailparser/__init__.py index 4a66015..fcebedc 100644 --- a/mailparser/__init__.py +++ b/mailparser/__init__.py @@ -17,342 +17,4 @@ limitations under the License. """ -from __future__ import unicode_literals -from .exceptions import InvalidMail, NotUnicodeError -from email.errors import HeaderParseError -from email.header import decode_header -import datetime -import email -import ipaddress -import logging -import re -import time - -try: - import simplejson as json -except ImportError: - import json - -log = logging.getLogger(__name__) - - -class MailParser(object): - """Tokenizer for raw mails. """ - - def __init__(self): - # With these defect bad payload is on epilogue - self._epilogue_defects = set([ - "StartBoundaryNotFoundDefect"]) - - def parse_from_file(self, fd): - """Parsing mail from file. """ - - with open(fd) as mail: - self._message = email.message_from_file(mail) - self._parse() - - def parse_from_string(self, s): - """Parsing mail from string. 
""" - - self._message = email.message_from_string(s) - self._parse() - - def _decode_header_part(self, header): - output = u'' - - try: - for i in decode_header(header): - if i[1]: - output += unicode(i[0], i[1], errors='ignore').strip() - else: - output += unicode(i[0], errors='ignore').strip() - - # Header parsing failed, when header has charset Shift_JIS - except HeaderParseError: - log.error("Failed decoding header part: {}".format(header)) - output += header - - if not isinstance(output, unicode): - raise NotUnicodeError("Header part is not unicode") - - return output - - def _force_unicode(self, string, encoding): - try: - u = unicode(string, encoding=encoding, errors='ignore') - except: - u = unicode(string, errors='ignore',) - - if not isinstance(u, unicode): - raise NotUnicodeError("Body part is not unicode") - - return u - - def _append_defects(self, part, part_content_type): - part_defects = {} - - for e in part.defects: - defects = "{}: {}".format(e.__class__.__name__, e.__doc__) - self._defects_category.add(e.__class__.__name__) - - if part_defects: - part_defects[part_content_type].append(defects) - else: - part_defects[part_content_type] = [defects] - - # Tag mail with defect - if part_defects: - self._has_defects = True - - # Save all defects - self._defects.append(part_defects) - - def _reset(self): - self._attachments = list() - self._text_plain = list() - self._defects = list() - self._defects_category = set() - self._has_defects = False - self._has_anomalies = False - self._anomalies = list() - - def _make_mail(self): - self._mail = { - "attachments": self.attachments_list, - "body": self.body, - "date": self.date_mail, - "from": self.from_, - "headers": self.headers, - "message_id": self.message_id, - "subject": self.subject, - "to": self.to_, - "has_defects": self._has_defects, - "has_anomalies": self._has_anomalies, - } - - def _parse(self): - if not self._message.keys(): - raise InvalidMail("Mail without headers: {}".format( - 
self._message.as_string())) - - # Reset for new mail - self._reset() - parts = list() # Normal parts plus defects - - # walk all mail parts to search defects - for p in self._message.walk(): - part_content_type = p.get_content_type() - self._append_defects(p, part_content_type) - parts.append(p) - - # If defects are in epilogue defects get epilogue - if self._epilogue_defects & self._defects_category: - epilogue = self.find_between( - self._message.epilogue, - "{}".format("--" + self._message.get_boundary()), - "{}".format("--" + self._message.get_boundary() + "--")) - - try: - p = email.message_from_string(epilogue) - parts.append(p) - except: - log.error("Failed to get epilogue part") - - # walk all mail parts - for p in parts: - if not p.is_multipart(): - f = p.get_filename() - charset = p.get_content_charset('utf-8') - - if f: - filename = self._decode_header_part(f) - mail_content_type = self._decode_header_part( - p.get_content_type()) - transfer_encoding = \ - unicode(p.get('content-transfer-encoding', '')).lower() - - if transfer_encoding == "base64": - payload = p.get_payload(decode=False) - else: - payload = self._force_unicode( - string=p.get_payload(decode=True), - encoding=charset) - - self._attachments.append( - { - "filename": filename, - "payload": payload, - "mail_content_type": mail_content_type, - "content_transfer_encoding": transfer_encoding, - } - ) - else: - payload = self._force_unicode( - string=p.get_payload(decode=True), - encoding=charset) - self._text_plain.append(payload) - - # Parsed object mail - self._make_mail() - - # Add defects - if self.has_defects: - self._mail["defects"] = self.defects - self._mail["defects_category"] = list(self._defects_category) - - # Add anomalies - if self.has_anomalies: - self._mail["anomalies"] = self.anomalies - self._mail["has_anomalies"] = True - - def find_between(self, text, first_token, last_token): - try: - start = text.index(first_token) + len(first_token) - end = text.index(last_token, start) - 
return text[start:end].strip() - except ValueError: - return - - def get_server_ipaddress(self, trust): - """ Return ip address of sender - - Extract a reliable sender IP address heuristically for each message. - Although the message format dictates a chain of relaying IP - addresses in each message, a malicious relay can easily alter that. - Therefore we cannot simply take the first IP in - the chain. Instead, our method is as follows. - First we trust the sender IP reported by our mail server in the - Received headers, and if the previous relay IP address is on our trust - list (e.g. other well-known mail services), we continue to - follow the previous Received line, till we reach the first unrecognized - IP address in the email header. - - From article Characterizing Botnets from Email Spam Records: - Li Zhuang, J. D. Tygar - - In our case we trust only our mail server with the trust string. - - - Keyword arguments: - trust -- String that identify our mail server - """ - - received = self._message.get_all("received", []) - r = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}") - - for i in received: - if trust in i: - check = r.findall(i[0:i.find("by")]) - if check: - try: - ip = ipaddress.ip_address(unicode(check[-1])) - except ValueError: - return - - if not ip.is_private: - return unicode(check[-1]) - - @property - def body(self): - return "\n".join(self.text_plain_list) - - @property - def headers(self): - s = "" - for k, v in self._message.items(): - v_u = self._decode_header_part(v) - s += k + " " + v_u + "\n" - return s - - @property - def message_id(self): - message_id = self._message.get('message-id', None) - if not message_id: - self._anomalies.append('mail_without_message-id') - return None - else: - return self._decode_header_part(message_id) - - @property - def to_(self): - return self._decode_header_part( - self._message.get('to', self._message.get('delivered-to'))) - - @property - def from_(self): - return self._decode_header_part( - 
self._message.get('from')) - - @property - def subject(self): - return self._decode_header_part( - self._message.get('subject')) - - @property - def text_plain_list(self): - return self._text_plain - - @property - def attachments_list(self): - return self._attachments - - @property - def date_mail(self): - date_ = self._message.get('date') - - if not date_: - self._anomalies.append('mail_without_date') - return None - - try: - d = email.utils.parsedate(date_) - t = time.mktime(d) - return datetime.datetime.utcfromtimestamp(t) - except: - return None - - @property - def parsed_mail_obj(self): - return self._mail - - @property - def parsed_mail_json(self): - self._mail["date"] = self.date_mail.isoformat() \ - if self.date_mail else "" - return json.dumps( - self._mail, - ensure_ascii=False, - indent=None) - - @property - def defects(self): - """The defects property contains a list of - all the problems found when parsing this message. - """ - return self._defects - - @property - def defects_category(self): - """Return a list with only defects categories. """ - return self._defects_category - - @property - def has_defects(self): - """Boolean: True if mail has defects. 
def safe_print(data):
    """Print text, falling back to raw UTF-8 when stdout cannot encode it.

    Args:
        data: text to print (str on Python 3, unicode/str on Python 2)

    On Python 3, printing ``data.encode('utf-8')`` would emit the bytes
    repr (``b'...'``) rather than the text, so the encoded bytes are
    written straight to the binary layer of stdout when it exists.
    """
    # Local import: the rest of this module is not visible from here, so do
    # not rely on a module-level ``import sys``.
    import sys

    try:
        print(data)
    except UnicodeEncodeError:
        encoded = data.encode('utf-8')
        binary_stdout = getattr(sys.stdout, 'buffer', None)

        if binary_stdout is None:
            # Python 2 (or a stream without a binary layer): a byte string
            # is printed verbatim, which is the original behaviour.
            print(encoded)
        else:
            # Flush pending text first so output ordering is preserved.
            sys.stdout.flush()
            binary_stdout.write(encoded + b"\n")
            binary_stdout.flush()
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Copyright 2016 Fedele Mantuano (https://twitter.com/fedelemantuano)

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

from __future__ import unicode_literals
from email.errors import HeaderParseError
from email.header import decode_header
import datetime
import email
import email.utils
import ipaddress
import logging
import re
import six
import time  # noqa: F401 -- unused here now; kept so star-importers still see it

try:
    import simplejson as json
except ImportError:
    import json

log = logging.getLogger(__name__)

# Encoding used when none is given or the declared one is unknown.
_FALLBACK_ENCODING = 'utf-8'

# Dotted-quad candidates; each match is validated with ``ipaddress`` later.
_IPV4_CANDIDATE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")

# Naive UTC epoch, used to turn POSIX timestamps into naive UTC datetimes.
_EPOCH = datetime.datetime(1970, 1, 1)


class InvalidMail(ValueError):
    """Raised by the parser when a message has no headers at all."""


def ported_string(raw_data, encoding='utf-8', errors='ignore'):
    """ Give as input raw data and output a str in Python 3
    and unicode in Python 2.

    Args:
        raw_data: Python 2 str, Python 3 bytes or str to porting
        encoding: string giving the name of an encoding
        errors: this specifies the treatment of characters
            which are invalid in the input encoding

    Returns:
        str (Python 3) or unicode (Python 2), stripped of surrounding
        whitespace; an empty text string for empty/None input
    """

    if not raw_data:
        return six.text_type()

    # Already text on either Python version: nothing to decode.
    if isinstance(raw_data, six.text_type):
        return raw_data.strip()

    try:
        # ``errors`` must be honoured on Python 3 too, otherwise a single
        # invalid byte aborts the whole parsing.
        text = six.text_type(
            raw_data, encoding or _FALLBACK_ENCODING, errors)
    except LookupError:
        # The mail declared a charset Python does not know.
        log.warning(
            "Unknown encoding %r, falling back to %s",
            encoding, _FALLBACK_ENCODING)
        text = six.text_type(raw_data, _FALLBACK_ENCODING, errors)
    except TypeError:
        # Not a bytes-like object (e.g. a header object): use its text form.
        text = six.text_type(raw_data)

    return text.strip()


def decode_header_part(header):
    """Decode a raw (possibly RFC 2047 encoded) header value to text.

    Args:
        header: raw header value, or None when the header is missing

    Returns:
        str (Python 3) or unicode (Python 2); empty for a missing header.
        Every decoded chunk is stripped before concatenation.
    """

    # A missing header must not crash the parser: on Python 3
    # decode_header(None) raises TypeError.
    if not header:
        return six.text_type()

    try:
        chunks = decode_header(header)

    # Header parsing failed, when header has charset Shift_JIS
    except HeaderParseError:
        log.error("Failed decoding header part: %r", header)
        return ported_string(header)

    return six.text_type().join(
        ported_string(data, charset or _FALLBACK_ENCODING, 'ignore')
        for data, charset in chunks)


def ported_open(file_):
    """Open a file path for reading text on both Python 2 and Python 3.

    On Python 3 undecodable bytes are ignored instead of raising.
    """
    if six.PY2:
        return open(file_)
    elif six.PY3:
        return open(file_, errors='ignore')


def find_between(text, first_token, last_token):
    """Return the stripped text between two tokens.

    Args:
        text: string to search (may be None or empty)
        first_token: marker preceding the wanted text
        last_token: marker following the wanted text

    Returns:
        The stripped substring, or None when ``text`` is empty/None or
        either token is not found.
    """
    if not text:
        return None

    try:
        start = text.index(first_token) + len(first_token)
        end = text.index(last_token, start)
    except ValueError:
        return None

    return text[start:end].strip()


class MailParser(object):
    """Tokenizer for raw mails. """

    # With these defect bad payload is on epilogue
    epilogue_defects = {"StartBoundaryNotFoundDefect"}

    def parse_from_file(self, fd):
        """Parsing mail from file.

        Args:
            fd: path of the file that contains the raw mail
        """

        with ported_open(fd) as mail:
            self._message = email.message_from_file(mail)

        self._parse()

    def parse_from_string(self, s):
        """Parsing mail from string.

        Args:
            s: raw mail as a text string
        """

        self._message = email.message_from_string(s)
        self._parse()

    def _append_defects(self, part, part_content_type):
        """Record the defects of one part, keyed by its content type."""
        descriptions = []

        for defect in part.defects:
            name = defect.__class__.__name__
            self._defects_category.add(name)
            descriptions.append("{}: {}".format(name, defect.__doc__))

        if descriptions:
            # Tag mail with defect and save all defects of this part
            self._has_defects = True
            self._defects.append({part_content_type: descriptions})

    def _record_anomaly(self, name):
        """Register an anomaly once, however often a property is read."""
        if name not in self._anomalies:
            self._anomalies.append(name)

    def _reset(self):
        """Clear every per-mail result before parsing a new message."""
        self._attachments = list()
        self._text_plain = list()
        self._defects = list()
        self._defects_category = set()
        self._has_defects = False
        self._has_anomalies = False
        self._anomalies = list()

    def _make_mail(self):
        """Build the parsed mail dict from the current properties."""
        self._mail = {
            "attachments": self.attachments_list,
            "body": self.body,
            "date": self.date_mail,
            "from": self.from_,
            "headers": self.headers,
            "message_id": self.message_id,
            "subject": self.subject,
            "to": self.to_,
            "has_defects": self._has_defects,
            "has_anomalies": self._has_anomalies,
        }

    def _epilogue_part(self):
        """Return the part hidden in the epilogue, or None if unavailable."""
        boundary = self._message.get_boundary()
        if not boundary:
            log.error("Failed to get epilogue part")
            return None

        opening = "--" + boundary
        epilogue = find_between(
            self._message.epilogue, opening, opening + "--")
        if epilogue is None:
            log.error("Failed to get epilogue part")
            return None

        try:
            return email.message_from_string(epilogue)
        except Exception:
            # Best effort: a broken epilogue must not abort the parsing.
            log.exception("Failed to get epilogue part")
            return None

    def _make_attachment(self, part, filename, charset):
        """Build the attachment dict for a part that carries a filename."""
        transfer_encoding = ported_string(
            part.get('content-transfer-encoding', '')).lower()

        if transfer_encoding == "base64":
            # Keep base64 payloads encoded
            payload = part.get_payload(decode=False)
        else:
            payload = ported_string(
                part.get_payload(decode=True), encoding=charset)

        return {
            "filename": filename,
            "payload": payload,
            "mail_content_type": ported_string(part.get_content_type()),
            "content_transfer_encoding": transfer_encoding,
        }

    def _parse(self):
        """Tokenize ``self._message`` into body, attachments and metadata.

        Raises:
            InvalidMail: if the message has no headers
        """
        if not self._message.keys():
            raise InvalidMail("Mail without headers: {}".format(
                self._message.as_string()))

        # Reset for new mail
        self._reset()
        parts = list()  # Normal parts plus defects

        # walk all mail parts to search defects
        for p in self._message.walk():
            self._append_defects(p, p.get_content_type())
            parts.append(p)

        # If defects are in epilogue defects get epilogue
        if self.epilogue_defects & self._defects_category:
            epilogue_part = self._epilogue_part()
            if epilogue_part is not None:
                parts.append(epilogue_part)

        # walk all mail parts
        for p in parts:
            if p.is_multipart():
                continue

            filename = ported_string(p.get_filename())
            charset = p.get_content_charset('utf-8')

            if filename:
                self._attachments.append(
                    self._make_attachment(p, filename, charset))
            else:
                payload = ported_string(
                    p.get_payload(decode=True), encoding=charset)
                if payload:
                    self._text_plain.append(payload)

        # Parsed object mail
        self._make_mail()

        # Add defects
        if self.has_defects:
            self._mail["defects"] = self.defects
            self._mail["defects_category"] = list(self._defects_category)

        # Add anomalies
        if self.has_anomalies:
            self._has_anomalies = True
            self._mail["anomalies"] = self.anomalies
            self._mail["has_anomalies"] = True

    def get_server_ipaddress(self, trust):
        """ Return ip address of sender

        Extract a reliable sender IP address heuristically for each message.
        Although the message format dictates a chain of relaying IP
        addresses in each message, a malicious relay can easily alter that.
        Therefore we cannot simply take the first IP in
        the chain. Instead, our method is as follows.
        First we trust the sender IP reported by our mail server in the
        Received headers, and if the previous relay IP address is on our trust
        list (e.g. other well-known mail services), we continue to
        follow the previous Received line, till we reach the first unrecognized
        IP address in the email header.

        From article Characterizing Botnets from Email Spam Records:
            Li Zhuang, J. D. Tygar

        In our case we trust only our mail server with the trust string.


        Keyword arguments:
        trust -- String that identify our mail server

        Returns the first non-private address found, or None.
        """

        for received in self._message.get_all("received", []):
            if trust not in received:
                continue

            # Only the text before "by" is searched for addresses.
            candidates = _IPV4_CANDIDATE.findall(
                received[0:received.find("by")])
            if not candidates:
                continue

            candidate = six.text_type(candidates[-1])

            try:
                ip = ipaddress.ip_address(candidate)
            except ValueError:
                # The regex matched something that is not a valid address.
                return None

            if not ip.is_private:
                return candidate

        return None

    @property
    def body(self):
        """All text parts of the mail joined by newlines."""
        return "\n".join(self.text_plain_list)

    @property
    def headers(self):
        """All headers as text, one ``name value`` pair per line."""
        return "".join(
            k + " " + decode_header_part(v) + "\n"
            for k, v in self._message.items())

    @property
    def message_id(self):
        """Message-ID header, or None (recorded as an anomaly) if missing."""
        message_id = self._message.get('message-id', None)
        if not message_id:
            self._record_anomaly('mail_without_message-id')
            return None

        return ported_string(message_id)

    @property
    def to_(self):
        """Decoded To header, falling back to Delivered-To."""
        return decode_header_part(
            self._message.get('to', self._message.get('delivered-to')))

    @property
    def from_(self):
        """Decoded From header (empty if missing)."""
        return decode_header_part(self._message.get('from'))

    @property
    def subject(self):
        """Decoded Subject header (empty if missing)."""
        return decode_header_part(self._message.get('subject'))

    @property
    def text_plain_list(self):
        """List of the decoded text parts."""
        return self._text_plain

    @property
    def attachments_list(self):
        """List of attachment dicts."""
        return self._attachments

    @property
    def date_mail(self):
        """Date header as a naive datetime in UTC.

        Returns None when the header is missing (recorded as an anomaly)
        or cannot be parsed.
        """
        date_ = self._message.get('date')

        if not date_:
            self._record_anomaly('mail_without_date')
            return None

        try:
            # parsedate_tz/mktime_tz honour the UTC offset of the header,
            # so the result no longer depends on the host timezone.
            parsed = email.utils.parsedate_tz(date_)
            if parsed is None:
                return None

            timestamp = email.utils.mktime_tz(parsed)
            return _EPOCH + datetime.timedelta(seconds=timestamp)
        except Exception:
            # Best effort: any failure means there is no usable date.
            return None

    @property
    def parsed_mail_obj(self):
        """Parsed mail as a dict."""
        return self._mail

    @property
    def parsed_mail_json(self):
        """Parsed mail as a JSON string; the date is in ISO 8601 format."""
        # Work on a copy so parsed_mail_obj keeps its datetime value.
        mail = dict(self._mail)
        date = self.date_mail
        mail["date"] = date.isoformat() if date else ""
        return json.dumps(mail, ensure_ascii=False, indent=None)

    @property
    def defects(self):
        """The defects property contains a list of
        all the problems found when parsing this message.
        """
        return self._defects

    @property
    def defects_category(self):
        """Return a list with only defects categories. """
        return self._defects_category

    @property
    def has_defects(self):
        """Boolean: True if mail has defects. """
        return self._has_defects

    @property
    def anomalies(self):
        """The anomalies property contains a list of
        all anomalies in mail:
            - mail_without_date
            - mail_without_message-id
        """
        return self._anomalies

    @property
    def has_anomalies(self):
        """Boolean: True if mail has anomalies. """
        return bool(self.anomalies)
self.assertIn( - "mail_without_date", - parser.anomalies, - ) + self.assertIn("mail_without_date", parser.anomalies) def test_parsing_know_values(self): parser = mailparser.MailParser() @@ -84,9 +82,7 @@ def test_parsing_know_values(self): self.assertEqual(3, result) raw = "Sun, 29 Nov 2015 09:45:18 +0100" - raw_utc = datetime.datetime( - 2015, 11, 29, 8, 45, 18, 0 - ).isoformat() + raw_utc = datetime.datetime(2015, 11, 29, 8, 45, 18, 0).isoformat() result = parser.date_mail.isoformat() self.assertEqual(raw_utc, result) @@ -103,31 +99,31 @@ def test_types(self): self.assertNotIn("anomalies", result) result = parser.get_server_ipaddress(trust) - self.assertIsInstance(result, unicode) + self.assertIsInstance(result, six.text_type) result = parser.parsed_mail_json - self.assertIsInstance(result, unicode) + self.assertIsInstance(result, six.text_type) result = parser.headers - self.assertIsInstance(result, unicode) + self.assertIsInstance(result, six.text_type) result = parser.body - self.assertIsInstance(result, unicode) + self.assertIsInstance(result, six.text_type) result = parser.date_mail self.assertIsInstance(result, datetime.datetime) result = parser.from_ - self.assertIsInstance(result, unicode) + self.assertIsInstance(result, six.text_type) result = parser.to_ - self.assertIsInstance(result, unicode) + self.assertIsInstance(result, six.text_type) result = parser.subject - self.assertIsInstance(result, unicode) + self.assertIsInstance(result, six.text_type) result = parser.message_id - self.assertIsInstance(result, unicode) + self.assertIsInstance(result, six.text_type) result = parser.attachments_list self.assertIsInstance(result, list) @@ -150,13 +146,23 @@ def test_defects_anomalies(self): self.assertEqual(1, len(parser.defects_category)) self.assertIn("defects", parser.parsed_mail_obj) self.assertIn("StartBoundaryNotFoundDefect", parser.defects_category) - self.assertIsInstance(parser.parsed_mail_json, unicode) + 
self.assertIsInstance(parser.parsed_mail_json, six.text_type) result = len(parser.attachments_list) self.assertEqual(1, result) parser.parse_from_file(mail_test_1) - self.assertEqual(False, parser.has_defects) + if six.PY2: + self.assertEqual(False, parser.has_defects) + self.assertNotIn("defects", parser.parsed_mail_obj) + elif six.PY3: + self.assertEqual(True, parser.has_defects) + self.assertEqual(1, len(parser.defects)) + self.assertEqual(1, len(parser.defects_category)) + self.assertIn("defects", parser.parsed_mail_obj) + self.assertIn( + "CloseBoundaryNotFoundDefect", parser.defects_category) + self.assertEqual(True, parser.has_anomalies) self.assertEqual(2, len(parser.anomalies)) self.assertIn("anomalies", parser.parsed_mail_obj) @@ -170,7 +176,7 @@ def test_defects_bug(self): self.assertEqual(1, len(parser.defects_category)) self.assertIn("defects", parser.parsed_mail_obj) self.assertIn("StartBoundaryNotFoundDefect", parser.defects_category) - self.assertIsInstance(parser.parsed_mail_json, unicode) + self.assertIsInstance(parser.parsed_mail_json, six.text_type) result = len(parser.attachments_list) self.assertEqual(0, result) @@ -183,22 +189,14 @@ def test_add_content_type(self): result = parser.parsed_mail_obj - self.assertEqual( - len(result["attachments"]), - 1 - ) + self.assertEqual(len(result["attachments"]), 1) self.assertIsInstance( - result["attachments"][0]["mail_content_type"], - unicode - ) + result["attachments"][0]["mail_content_type"], six.text_type) self.assertIsInstance( - result["attachments"][0]["payload"], - unicode - ) + result["attachments"][0]["payload"], six.text_type) self.assertEqual( result["attachments"][0]["content_transfer_encoding"], - "quoted-printable", - ) + "quoted-printable") if __name__ == '__main__':