
Merge pull request #182 from reportportal/develop
Develop
HardNorth authored Sep 10, 2024
2 parents a403950 + 75f9f39 commit c9dbf20
Showing 14 changed files with 222 additions and 193 deletions.
Dockerfile: 2 changes (1 addition, 1 deletion)
@@ -44,7 +44,7 @@ WORKDIR /backend/
COPY --from=builder /backend ./
COPY --from=builder /venv /venv
COPY --from=builder /usr/share/nltk_data /usr/share/nltk_data/
RUN dnf -y upgrade && dnf -y install libgomp pcre-devel \
RUN dnf -y upgrade && dnf -y install pcre-devel \
&& dnf -y remove emacs-filesystem libjpeg-turbo libtiff libpng wget \
&& dnf -y autoremove \
&& dnf clean all \
app/commons/clusterizer.py: 22 changes (12 additions, 10 deletions)
@@ -31,19 +31,20 @@ class Clusterizer:
def __init__(self):
pass

def calculate_hashes(self, messages, n_gram=2, n_permutations=64):
def calculate_hashes(self, messages: list[str], n_gram: int = 2, n_permutations: int = 64) -> list[list[str]]:
hashes = []
for message in messages:
words = message.split()
hash_print = set()
len_words = (len(words) - n_gram) if len(words) > n_gram else len(words)
for i in range(len_words):
hash_print.add(hashlib.md5(" ".join(words[i:i + n_gram]).encode("utf-8")).hexdigest())
hash_print = list(heapq.nlargest(n_permutations, hash_print))
hashes.append(hash_print)
hashes.append(list(heapq.nlargest(n_permutations, hash_print)))
return hashes

def find_groups_by_similarity(self, messages, groups_to_check, threshold=0.95):
def find_groups_by_similarity(
self, messages: list[str], groups_to_check: dict[int, list[int]],
threshold: float = 0.95) -> dict[int, list[int]]:
if len(messages) == 0:
return {}
rearranged_groups = {}
@@ -67,7 +68,9 @@ def find_groups_by_similarity(self, messages, groups_to_check, threshold=0.95):
logger.debug("Time for finding groups: %.2f s", time() - start_time)
return rearranged_groups

def similarity_groupping(self, hash_prints, block_size=1000, for_text=True, threshold=0.95):
def similarity_groupping(
self, hash_prints: list[list[str]] | list[str], block_size: int = 1000, for_text: bool = True,
threshold: float = 0.95) -> dict[int, int]:
num_of_blocks = int(np.ceil(len(hash_prints) / block_size))
hash_groups = {}
global_ind = 0
@@ -105,7 +108,7 @@ def similarity_groupping(self, hash_prints, block_size=1000, for_text=True, thre
hash_groups[j] = hash_groups[i]
return hash_groups

def unite_groups_by_hashes(self, messages, threshold=0.95):
def unite_groups_by_hashes(self, messages: list[str], threshold: float = 0.95) -> dict[int, list[int]]:
start_time = time()
hash_prints = self.calculate_hashes(messages)
has_no_empty = False
@@ -125,7 +128,7 @@ def similarity_groupping(self, hash_prints, block_size=1000, for_text=True, thre
logger.debug("Time for finding hash groups: %.2f s", time() - start_time)
return rearranged_groups

def perform_light_deduplication(self, messages):
def perform_light_deduplication(self, messages: list[str]) -> tuple[list[str], dict[int, list[int]]]:
text_messages_set = {}
messages_to_cluster = []
ids_with_duplicates = {}
@@ -142,11 +145,10 @@ def perform_light_deduplication(self, messages):
ids_with_duplicates[text_messages_set[text_message_normalized]].append(idx)
return messages_to_cluster, ids_with_duplicates

def find_clusters(self, messages, threshold=0.95):
def find_clusters(self, messages: list[str], threshold: float = 0.95) -> dict[int, list[int]]:
messages_to_cluster, ids_with_duplicates = self.perform_light_deduplication(messages)
hash_groups = self.unite_groups_by_hashes(messages_to_cluster, threshold=threshold)
groups = self.find_groups_by_similarity(
messages_to_cluster, hash_groups, threshold=threshold)
groups = self.find_groups_by_similarity(messages_to_cluster, hash_groups, threshold=threshold)
new_groups = {}
for cluster in groups:
new_log_ids = []
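For orientation (not part of the diff itself): the methods annotated above form a pipeline in which `find_clusters` first deduplicates normalized messages, then groups them by shared n-gram hash fingerprints, and finally refines those groups by similarity. A minimal usage sketch, assuming the class is importable from `app.commons.clusterizer` and its dependencies are installed; the sample messages are hypothetical:

```python
from app.commons.clusterizer import Clusterizer

# Hypothetical log messages; in the service these come from prepared log documents.
messages = [
    "Connection to db-host refused after 30 s",
    "Connection to db-host refused after 45 s",
    "NullPointerException at LoginService.authenticate",
]

clusterizer = Clusterizer()
# Per the signature above, the result maps a cluster id to the indices of the
# messages that fell into that cluster.
clusters = clusterizer.find_clusters(messages, threshold=0.95)
for cluster_id, message_ids in clusters.items():
    print(cluster_id, [messages[i] for i in message_ids])
```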
app/commons/log_merger.py: 22 changes (12 additions, 10 deletions)
@@ -13,6 +13,7 @@
# limitations under the License.

import copy
from typing import Any, Optional

from app.utils import text_processing

@@ -34,9 +35,11 @@ def __init__(self):
"paths", "message_params", "detected_message_without_params_extended",
"whole_message"]

def merge_big_and_small_logs(self, logs: list[dict], log_level_ids_to_add: dict, log_level_messages: dict,
log_level_ids_merged, logs_ids_in_merged_logs) -> tuple[list[dict], dict]:
"""Merge big message logs with small ones"""
def merge_big_and_small_logs(
self, logs: list[dict[str, Any]], log_level_ids_to_add: dict[int, list[int]],
log_level_messages: dict[str, dict[int, str]], log_level_ids_merged: dict[int, dict[str, Any]],
logs_ids_in_merged_logs: dict[int, list[int]]) -> tuple[list[dict[str, Any]], dict[str, list[int]]]:
"""Merge big message logs with small ones."""
new_logs = []
for log in logs:
if not log["_source"]["message"].strip():
@@ -50,9 +53,7 @@ def merge_big_and_small_logs(self, logs: list[dict], log_level_ids_to_add: dict,

log_ids_for_merged_logs = {}
for log_level in log_level_messages["message"]:

if not log_level_ids_to_add[log_level] and \
log_level_messages["message"][log_level].strip():
if not log_level_ids_to_add[log_level] and log_level_messages["message"][log_level].strip():
log = log_level_ids_merged[log_level]
merged_logs_id = str(log["_id"]) + "_m"
new_log = self.prepare_new_log(
@@ -73,8 +74,9 @@ def merge_big_and_small_logs(self, logs: list[dict], log_level_ids_to_add: dict,
new_logs.append(new_log)
return new_logs, log_ids_for_merged_logs

def decompose_logs_merged_and_without_duplicates(self, logs: list[dict]) -> tuple[list[dict], dict]:
"""Merge big logs with small ones without duplicates"""
def decompose_logs_merged_and_without_duplicates(
self, logs: list[dict[str, Any]]) -> tuple[list[dict[str, Any]], dict[str, list[int]]]:
"""Merge big logs with small ones without duplicates."""
log_level_messages = {}
for field in self.fields_to_merge:
log_level_messages[field] = {}
@@ -127,8 +129,8 @@ def decompose_logs_merged_and_without_duplicates(self, logs: list[dict]) -> tupl
return self.merge_big_and_small_logs(
logs, log_level_ids_to_add, log_level_messages, log_level_ids_merged, logs_ids_in_merged_logs)

def prepare_new_log(self, old_log: dict, new_id, is_merged: bool, merged_small_logs: str,
fields_to_clean: list[str] | None = None) -> dict:
def prepare_new_log(self, old_log: dict[str, Any], new_id, is_merged: bool, merged_small_logs: str,
fields_to_clean: Optional[list[str]] = None) -> dict[str, Any]:
"""Prepare updated log"""
merged_log = copy.deepcopy(old_log)
merged_log["_source"]["is_merged"] = is_merged
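As a reading aid rather than part of the change: `decompose_logs_merged_and_without_duplicates` keeps logs with substantial messages as they are, folds short or empty messages into one synthetic document per log level, and reports which original ids each synthetic `<id>_m` document absorbed. A simplified, standalone sketch of that idea follows; it is not the module's actual code, and the field names are reduced for brevity:

```python
from collections import defaultdict


def merge_small_logs(logs: list[dict], min_words: int = 2) -> tuple[list[dict], dict[str, list[int]]]:
    """Simplified illustration of the merging idea behind LogMerger."""
    big_logs = []
    small_by_level: dict[int, list[dict]] = defaultdict(list)
    for log in logs:
        if len(log["message"].split()) >= min_words:
            big_logs.append(log)
        else:
            small_by_level[log["log_level"]].append(log)

    merged_logs: list[dict] = []
    ids_in_merged: dict[str, list[int]] = {}
    for level, group in small_by_level.items():
        # Mirrors merged_logs_id = str(log["_id"]) + "_m" in the diff above.
        merged_id = f"{group[0]['id']}_m"
        merged_logs.append({
            "id": merged_id,
            "log_level": level,
            "merged_small_logs": " ".join(log["message"] for log in group),
        })
        ids_in_merged[merged_id] = [log["id"] for log in group]
    return big_logs + merged_logs, ids_in_merged


logs = [
    {"id": 1, "log_level": 40000, "message": "java.io.IOException: connection reset by peer"},
    {"id": 2, "log_level": 40000, "message": "retrying"},
    {"id": 3, "log_level": 40000, "message": "failed"},
]
merged, mapping = merge_small_logs(logs)
print(mapping)  # {'2_m': [2, 3]}
```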
app/commons/log_requests.py: 103 changes (54 additions, 49 deletions)
@@ -13,6 +13,7 @@
# limitations under the License.

from datetime import datetime
from typing import Any

from app.commons.model.launch_objects import Launch, TestItem, Log, TestItemInfo
from app.commons.log_merger import LogMerger
@@ -21,47 +22,50 @@
from app.utils.log_preparation import basic_prepare


def create_log_template() -> dict:
return {
"_id": "",
"_index": "",
"_source": {
"launch_id": "",
"launch_name": "",
"launch_number": 0,
"launch_start_time": "",
"test_item": "",
"test_item_name": "",
"unique_id": "",
"cluster_id": "",
"cluster_message": "",
"test_case_hash": 0,
"is_auto_analyzed": False,
"issue_type": "",
"log_time": "",
"log_level": 0,
'original_message': '',
"original_message_lines": 0,
"original_message_words_number": 0,
"message": "",
"is_merged": False,
"start_time": "",
"merged_small_logs": "",
"detected_message": "",
"detected_message_with_numbers": "",
"stacktrace": "",
"only_numbers": "",
"found_exceptions": "",
"whole_message": "",
"potential_status_codes": "",
"found_tests_and_methods": "",
"cluster_with_numbers": False
}
}


class LogRequests:

def __init__(self):
self.log_merger = LogMerger()

@staticmethod
def _create_log_template() -> dict:
return {
"_id": "",
"_index": "",
"_source": {
"launch_id": "",
"launch_name": "",
"launch_number": 0,
"launch_start_time": "",
"test_item": "",
"test_item_name": "",
"unique_id": "",
"cluster_id": "",
"cluster_message": "",
"test_case_hash": 0,
"is_auto_analyzed": False,
"issue_type": "",
"log_time": "",
"log_level": 0,
"original_message_lines": 0,
"original_message_words_number": 0,
"message": "",
"is_merged": False,
"start_time": "",
"merged_small_logs": "",
"detected_message": "",
"detected_message_with_numbers": "",
"stacktrace": "",
"only_numbers": "",
"found_exceptions": "",
"whole_message": "",
"potential_status_codes": "",
"found_tests_and_methods": "",
"cluster_with_numbers": False}}

@staticmethod
def transform_issue_type_into_lowercase(issue_type):
return issue_type[:2].lower() + issue_type[2:]
@@ -79,21 +83,20 @@ def _fill_launch_test_item_fields(log_template: dict, launch: Launch, test_item:
log_template["_source"]["test_case_hash"] = test_item.testCaseHash
log_template["_source"]["is_auto_analyzed"] = test_item.isAutoAnalyzed
log_template["_source"]["test_item_name"] = text_processing.preprocess_test_item_name(test_item.testItemName)
log_template["_source"]["issue_type"] = LogRequests.transform_issue_type_into_lowercase(
test_item.issueType)
log_template["_source"]["start_time"] = datetime(
*test_item.startTime[:6]).strftime("%Y-%m-%d %H:%M:%S")
log_template["_source"]["issue_type"] = LogRequests.transform_issue_type_into_lowercase(test_item.issueType)
log_template["_source"]["start_time"] = datetime(*test_item.startTime[:6]).strftime("%Y-%m-%d %H:%M:%S")
return log_template

@staticmethod
def _fill_log_fields(log_template: dict, log: Log, number_of_lines: int):
def _fill_log_fields(log_template: dict, log: Log, number_of_lines: int) -> dict[str, Any]:
prepared_log = PreparedLogMessage(log.message, number_of_lines)
log_template["_id"] = log.logId
log_template["_source"]["log_time"] = datetime(*log.logTime[:6]).strftime("%Y-%m-%d %H:%M:%S")
log_template["_source"]["cluster_id"] = str(log.clusterId)
log_template["_source"]["cluster_message"] = log.clusterMessage
log_template["_source"]["cluster_with_numbers"] = utils.extract_clustering_setting(log.clusterId)
log_template["_source"]["log_level"] = log.logLevel
log_template["_source"]['original_message'] = log.message
log_template["_source"]["original_message_lines"] = text_processing.calculate_line_number(
prepared_log.clean_message)
log_template["_source"]["original_message_words_number"] = len(
@@ -137,13 +140,13 @@ def _fill_log_fields(log_template: dict, log: Log, number_of_lines: int):

@staticmethod
def _prepare_log(launch: Launch, test_item: TestItem, log: Log, project: str) -> dict:
log_template = LogRequests._create_log_template()
log_template = create_log_template()
log_template = LogRequests._fill_launch_test_item_fields(log_template, launch, test_item, project)
log_template = LogRequests._fill_log_fields(log_template, log, launch.analyzerConfig.numberOfLogLines)
return log_template

@staticmethod
def _fill_test_item_info_fields(log_template: dict, test_item_info: TestItemInfo, project: str) -> dict:
def _fill_test_item_info_fields(log_template: dict, test_item_info: TestItemInfo, project: str) -> dict[str, Any]:
log_template["_index"] = project
log_template["_source"]["launch_id"] = test_item_info.launchId
log_template["_source"]["launch_name"] = test_item_info.launchName
@@ -160,7 +163,7 @@ def _fill_test_item_info_fields(log_template: dict, test_item_info: TestItemInfo

@staticmethod
def _prepare_log_for_suggests(test_item_info: TestItemInfo, log: Log, project: str) -> dict:
log_template = LogRequests._create_log_template()
log_template = create_log_template()
log_template = LogRequests._fill_test_item_info_fields(log_template, test_item_info, project)
log_template = LogRequests._fill_log_fields(
log_template, log, test_item_info.analyzerConfig.numberOfLogLines)
@@ -184,14 +187,15 @@ def prepare_log_words(launches: list[Launch]) -> tuple[dict[str, int], int]:
return log_words, project

@staticmethod
def prepare_log_clustering_light(launch: Launch, test_item: TestItem, log: Log, project: str):
log_template = LogRequests._create_log_template()
def prepare_log_clustering_light(launch: Launch, test_item: TestItem, log: Log, project: str) -> dict[str, Any]:
log_template = create_log_template()
log_template = LogRequests._fill_launch_test_item_fields(log_template, launch, test_item, project)
prepared_log = PreparedLogMessage(log.message, -1)
log_template["_id"] = log.logId
log_template["_source"]["cluster_id"] = str(log.clusterId)
log_template["_source"]["cluster_message"] = log.clusterMessage
log_template["_source"]["log_level"] = log.logLevel
log_template["_source"]['original_message'] = log.message
log_template["_source"]["original_message_lines"] = text_processing.calculate_line_number(
prepared_log.clean_message)
log_template["_source"]["original_message_words_number"] = len(
@@ -206,7 +210,8 @@ def prepare_log_clustering_light(launch: Launch, test_item: TestItem, log: Log,
+ prepared_log.stacktrace)
return log_template

def prepare_logs_for_clustering(self, launch: Launch, number_of_lines: int, clean_numbers: bool, project: str):
def prepare_logs_for_clustering(self, launch: Launch, number_of_lines: int, clean_numbers: bool,
project: str) -> tuple[list[str], dict[int, dict[str, Any]], dict[str, list[int]]]:
log_messages = []
log_dict = {}
ind = 0
Expand All @@ -219,8 +224,8 @@ def prepare_logs_for_clustering(self, launch: Launch, number_of_lines: int, clea
prepared_logs.append(LogRequests.prepare_log_clustering_light(launch, test_item, log, project))
merged_logs, log_ids_for_merged_logs = self.log_merger.decompose_logs_merged_and_without_duplicates(
prepared_logs)
for _id in log_ids_for_merged_logs:
full_log_ids_for_merged_logs[_id] = log_ids_for_merged_logs[_id]
for _id, merged_list in log_ids_for_merged_logs.items():
full_log_ids_for_merged_logs[_id] = merged_list
for log in merged_logs:
number_of_log_lines = number_of_lines
if log["_source"]["is_merged"]:
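Two practical effects of this file's changes, beyond the added type hints: the template builder is now a module-level `create_log_template()` instead of the static method `LogRequests._create_log_template()`, and prepared documents carry the raw log text in a new `_source["original_message"]` field. A hedged sketch of building a template directly, assuming the module path stays `app.commons.log_requests`; the index and id values are made up for illustration:

```python
from app.commons.log_requests import create_log_template

template = create_log_template()
template["_index"] = "project-1"   # hypothetical project index
template["_id"] = 12345            # hypothetical log id
template["_source"]["log_level"] = 40000
template["_source"]["message"] = "java.lang.NullPointerException at LoginService.authenticate"
# New in this commit: the unprocessed text is kept alongside the cleaned message,
# as _fill_log_fields and prepare_log_clustering_light do with log.message.
template["_source"]["original_message"] = template["_source"]["message"]

assert "original_message" in create_log_template()["_source"]
```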