Skip to content

Commit

Permalink
Merge pull request #2329 from gethvi/fix-csv-parser
Browse files Browse the repository at this point in the history
FIX+ENH: Time conversions for CSV parser and HTML table parser
  • Loading branch information
sebix authored May 19, 2023
2 parents 64abf29 + 6190bc0 commit 3be2460
Show file tree
Hide file tree
Showing 10 changed files with 263 additions and 77 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ CHANGELOG
- Fixed not resetting destination path statistics in the stats cache after restarting bot (Fixes [#2331](https://github.com/certtools/intelmq/issues/2331))
- Force flushing statistics if bot will sleep longer than flushing delay (Fixes [#2336](https://github.com/certtools/intelmq/issues/2336))
- `intelmq.lib.upgrades`: Fix a bug in the upgrade function for version 3.1.0 which caused an exception if a generic csv parser instance had no parameter `type` (PR#2319 by Filip Pokorný).
- `intelmq.lib.datatypes`: Adds `TimeFormat` class to be used for the `time_format` bot parameter (PR#2329 by Filip Pokorný).
- `intelmq.lib.exceptions`: Fixes a bug in `InvalidArgument` exception (PR#2329 by Filip Pokorný).
- `intelmq.lib.harmonization`: Changes the signatures and names of the `DateTime` conversion functions for consistency, in a backwards-compatible way (PR#2329 by Filip Pokorný).

### Development

Expand All @@ -31,6 +34,8 @@ CHANGELOG
- Added 'Accessible-SIP' report. (PR#2348)
- Added 'IPv6-Open-HTTP-Proxy' and 'IPv6-Accessible-HTTP-Proxy' aliases. (PR#2348)
- Removed duplicate mappings from the 'Spam-URL' report. (PR#2348)
- `intelmq.bots.parsers.generic.parser_csv`: Changes `time_format` parameter to use new `TimeFormat` class (PR#2329 by Filip Pokorný).
- `intelmq.bots.parsers.html_table.parser`: Changes `time_format` parameter to use new `TimeFormat` class (PR#2329 by Filip Pokorný).

#### Experts
- `intelmq.bots.experts.sieve`:
Expand Down
2 changes: 1 addition & 1 deletion intelmq/bots/experts/sieve/expert.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,7 +315,7 @@ def process_bool_match(self, key, op, value, event):
return self._bool_op_map[op](event[key], value)

def compute_basic_math(self, action, event) -> str:
date = DateTime.parse_utc_isoformat(event[action.key], True)
date = DateTime.from_isoformat(event[action.key], True)
delta = datetime.timedelta(minutes=parse_relative(action.value))

return self._basic_math_op_map[action.operator](date, delta).isoformat()
Expand Down
4 changes: 2 additions & 2 deletions intelmq/bots/parsers/abusech/parser_feodotracker.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def parse_line(self, line, report):
if line.get("first_seen"):
try:
event.add("time.source",
str(DateTime.convert_from_format(value=line.get("first_seen"), format="%Y-%m-%d %H:%M:%S")),
str(DateTime.from_format(value=line.get("first_seen"), format="%Y-%m-%d %H:%M:%S")),
raise_failure=False)

except ValueError:
Expand All @@ -48,7 +48,7 @@ def parse_line(self, line, report):
elif line.get("last_online"):
try:
event.add("time.source",
str(DateTime.convert_from_format_midnight(line.get("last_online"), format="%Y-%m-%d")),
str(DateTime.from_format_midnight(line.get("last_online"), format="%Y-%m-%d")),
raise_failure=False)
except ValueError:
self.logger.warning("Failed to parse '%s' to DateTime.", line.get('last_online'))
Expand Down
22 changes: 4 additions & 18 deletions intelmq/bots/parsers/generic/parser_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,18 +21,11 @@
import re
from typing import Optional, Union, Iterable

from dateutil.parser import parse

from intelmq.lib import utils
from intelmq.lib.bot import ParserBot
from intelmq.lib.exceptions import InvalidArgument, InvalidValue
from intelmq.lib.harmonization import DateTime
from intelmq.lib.utils import RewindableFileHandle

TIME_CONVERSIONS = {'timestamp': DateTime.from_timestamp,
'windows_nt': DateTime.from_windows_nt,
'epoch_millis': DateTime.from_epoch_millis,
None: lambda value: parse(value, fuzzy=True).isoformat() + " UTC"}
from intelmq.lib.datatypes import TimeFormat

DATA_CONVERSIONS = {'json': lambda data: json.loads(data)}
DOCS = "https://intelmq.readthedocs.io/en/latest/guides/Bots.html#generic-csv-parser"
Expand All @@ -49,7 +42,7 @@ class GenericCsvParserBot(ParserBot):
delimiter: str = ','
filter_text = None
filter_type = None
time_format = None
time_format: Optional[TimeFormat] = None
type: Optional[str] = None
type_translation = {}
skip_header: Union[bool, int] = False
Expand All @@ -67,14 +60,8 @@ def init(self):

# prevents empty strings:
self.column_regex_search = self.column_regex_search or {}
self.time_format = TimeFormat(self.time_format)

# handle empty strings, false etc.
if not self.time_format:
self.time_format = None
if self.time_format not in TIME_CONVERSIONS.keys():
raise InvalidArgument('time_format', got=self.time_format,
expected=list(TIME_CONVERSIONS.keys()),
docs=DOCS)
if self.filter_type and self.filter_type not in ('blacklist', 'whitelist'):
raise InvalidArgument('filter_type', got=self.filter_type,
expected=("blacklist", "whitelist"),
Expand Down Expand Up @@ -137,7 +124,6 @@ def parse_line(self, row: list, report):
if search:
value = search.group(0)
else:
type = None
value = None

if key in ("__IGNORE__", ""):
Expand All @@ -147,7 +133,7 @@ def parse_line(self, row: list, report):
value = DATA_CONVERSIONS[self.data_type[key]](value)

if key in ("time.source", "time.destination"):
value = TIME_CONVERSIONS[self.time_format](value)
value = self.time_format.parse_datetime(value)
elif key.endswith('.url'):
if not value:
continue
Expand Down
14 changes: 5 additions & 9 deletions intelmq/bots/parsers/html_table/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,12 @@
time_format: string
type: string
"""
from typing import Optional

from intelmq.lib import utils
from intelmq.lib.bot import ParserBot
from intelmq.lib.exceptions import InvalidArgument
from intelmq.lib.harmonization import DateTime
from intelmq.lib.exceptions import MissingDependencyError
from intelmq.lib.datatypes import TimeFormat


try:
Expand All @@ -46,7 +46,7 @@ class HTMLTableParserBot(ParserBot):
split_index = 0
split_separator = None
table_index = 0
time_format = None
time_format: Optional[TimeFormat] = None
type = "c2-server"
_parser = 'html.parser'

Expand All @@ -69,11 +69,7 @@ def init(self):
self.attr_value = self.attribute_value
self.skip_head = self.skip_table_head
self.skip_row = 1 if self.skip_head else 0

if self.time_format and self.time_format.split('|')[0] not in DateTime.TIME_CONVERSIONS.keys():
raise InvalidArgument('time_format', got=self.time_format,
expected=list(DateTime.TIME_CONVERSIONS.keys()),
docs='https://intelmq.readthedocs.io/en/latest/guides/Bots.html#html-table-parser')
self.time_format = TimeFormat(self.time_format)

def process(self):
report = self.receive_message()
Expand Down Expand Up @@ -119,7 +115,7 @@ def process(self):
data = int(data)
except ValueError:
pass
data = DateTime.convert(data, format=self.time_format)
data = self.time_format.parse_datetime(data)

elif key.endswith('.url'):
if not data:
Expand Down
84 changes: 81 additions & 3 deletions intelmq/lib/datatypes.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,15 @@
# SPDX-FileCopyrightText: 2021 Birger Schacht
#
# SPDX-License-Identifier: AGPL-3.0-or-later

from datetime import datetime
from enum import Enum
from inspect import signature
from typing import Optional, Callable, Union, List

from termstyle import green
import json

from intelmq.lib.exceptions import InvalidArgument
from intelmq.lib.harmonization import DateTime


class BotType(str, Enum):
Expand Down Expand Up @@ -40,7 +45,6 @@ def toJson(self):
'restarting': 'Restarting %s...',
}


ERROR_MESSAGES = {
'starting': 'Bot %s failed to START.',
'running': 'Bot %s is still running.',
Expand All @@ -59,3 +63,77 @@ class LogLevel(Enum):
WARNING = 2
ERROR = 3
CRITICAL = 4


class TimeFormat(str):
    """
    Pydantic style Field Type class for bot parameter time_format. Used for validation.

    The value has the form ``CONVERSION`` or ``CONVERSION|FORMAT_STRING``, where
    ``CONVERSION`` must be a key of ``DateTime.TIME_CONVERSIONS``. A missing or
    empty value falls back to ``"fuzzy"``. After construction the instance carries
    the resolved conversion function (``convert``) and the optional
    ``format_string``.
    """

    def __new__(cls, value: Optional[str] = None):
        """
        Create the underlying string, substituting ``"fuzzy"`` for a falsy value.

        Because str is immutable, the default has to be applied before the object
        is instantiated, therefore __new__ is overloaded instead of __init__.

        :param value: raw bot parameter (may be None or empty)
        """
        return super().__new__(cls, value or "fuzzy")

    def __init__(self, value: Optional[str] = None):
        """
        Resolve the conversion function and the format string for this value.

        :param value: raw bot parameter or an already constructed TimeFormat
        :raises InvalidArgument: if validation of the parameter fails
        """
        super().__init__()

        self.convert: Callable
        self.format_string: Optional[str]

        if isinstance(value, TimeFormat):
            # Already validated once: reuse its results instead of validating again.
            self.convert = value.convert
            self.format_string = value.format_string
        else:
            # Validate self, not value, so that the "fuzzy" default applied in
            # __new__ is what gets checked when value was None or empty.
            self.convert, self.format_string = TimeFormat.validate(self)

    def parse_datetime(self, value: str, return_datetime: bool = False) -> Union[datetime, str]:
        """
        This function uses the selected conversion function to parse the datetime value.

        :param value: external datetime string
        :param return_datetime: whether to return string or datetime object
        :return: parsed datetime or string
        """
        if self.format_string:
            return self.convert(value=value, format=self.format_string, return_datetime=return_datetime)
        return self.convert(value=value, return_datetime=return_datetime)

    @staticmethod
    def _split_spec(value: str) -> tuple:
        """
        Split a time_format value into conversion name and format string.

        Only the first ``|`` separates the two parts, so a format string may itself
        contain ``|`` characters without being truncated.

        :param value: bot parameter for datetime conversion
        :return: tuple ``(conversion_name, format_string)``; format_string is None
                 when the value contains no ``|`` at all
        """
        conversion_name, separator, format_string = value.partition('|')
        return conversion_name, (format_string if separator else None)

    @staticmethod
    def validate(value: str) -> tuple:
        """
        This function validates the time_format parameter value.

        :param value: bot parameter for datetime conversion
        :return: tuple of the correct time conversion function and the format string
                 (None if no format string was given)
        :raises InvalidArgument: if the conversion name is unknown, if the conversion
                 function expects a format string and none was given, or if a
                 format string was given although the function does not take one
        """
        conversion_name, format_string = TimeFormat._split_spec(value)

        # validation of the conversion name
        if conversion_name not in DateTime.TIME_CONVERSIONS:
            raise InvalidArgument(argument="time_format", got=value,
                                  expected=list(DateTime.TIME_CONVERSIONS))
        conversion: Callable = DateTime.TIME_CONVERSIONS[conversion_name]

        # Inspect the conversion function once: does it accept a "format" parameter?
        expects_format = "format" in signature(conversion).parameters

        # validate that we have format_string when the conversion function expects it
        if expects_format and not format_string:
            raise InvalidArgument(argument="time_format", got=value,
                                  expected=f"{conversion_name}|FORMAT_STRING")

        # validate that we do not have format_string when the conversion function doesn't expect it
        if format_string and not expects_format:
            raise InvalidArgument(argument="time_format", got=value,
                                  expected=conversion_name)

        return conversion, format_string
4 changes: 2 additions & 2 deletions intelmq/lib/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ class InvalidArgument(IntelMQException):
def __init__(self, argument: Any, got: Any = None, expected=None,
docs: str = None):
message = f"Argument {repr(argument)} is invalid."
if expected is list:
message += f" Should be one of: {list}."
if isinstance(expected, list):
message += f" Should be one of: {expected}."
elif expected: # not None
message += f" Should be of type: {expected}."
if got:
Expand Down
Loading

0 comments on commit 3be2460

Please sign in to comment.