
Merge pull request #182 from reportportal/develop
Develop
HardNorth authored Sep 10, 2024
2 parents a403950 + 75f9f39 commit c9dbf20
Showing 14 changed files with 222 additions and 193 deletions.
Dockerfile: 2 changes (1 addition, 1 deletion)
@@ -44,7 +44,7 @@ WORKDIR /backend/
COPY --from=builder /backend ./
COPY --from=builder /venv /venv
COPY --from=builder /usr/share/nltk_data /usr/share/nltk_data/
RUN dnf -y upgrade && dnf -y install libgomp pcre-devel \
RUN dnf -y upgrade && dnf -y install pcre-devel \
&& dnf -y remove emacs-filesystem libjpeg-turbo libtiff libpng wget \
&& dnf -y autoremove \
&& dnf clean all \
app/commons/clusterizer.py: 22 changes (12 additions, 10 deletions)
@@ -31,19 +31,20 @@ class Clusterizer:
def __init__(self):
pass

def calculate_hashes(self, messages, n_gram=2, n_permutations=64):
def calculate_hashes(self, messages: list[str], n_gram: int = 2, n_permutations: int = 64) -> list[list[str]]:
hashes = []
for message in messages:
words = message.split()
hash_print = set()
len_words = (len(words) - n_gram) if len(words) > n_gram else len(words)
for i in range(len_words):
hash_print.add(hashlib.md5(" ".join(words[i:i + n_gram]).encode("utf-8")).hexdigest())
hash_print = list(heapq.nlargest(n_permutations, hash_print))
hashes.append(hash_print)
hashes.append(list(heapq.nlargest(n_permutations, hash_print)))
return hashes

def find_groups_by_similarity(self, messages, groups_to_check, threshold=0.95):
def find_groups_by_similarity(
self, messages: list[str], groups_to_check: dict[int, list[int]],
threshold: float = 0.95) -> dict[int, list[int]]:
if len(messages) == 0:
return {}
rearranged_groups = {}
@@ -67,7 +68,9 @@ def find_groups_by_similarity(self, messages, groups_to_check, threshold=0.95):
logger.debug("Time for finding groups: %.2f s", time() - start_time)
return rearranged_groups

def similarity_groupping(self, hash_prints, block_size=1000, for_text=True, threshold=0.95):
def similarity_groupping(
self, hash_prints: list[list[str]] | list[str], block_size: int = 1000, for_text: bool = True,
threshold: float = 0.95) -> dict[int, int]:
num_of_blocks = int(np.ceil(len(hash_prints) / block_size))
hash_groups = {}
global_ind = 0
@@ -105,7 +108,7 @@ def similarity_groupping(self, hash_prints, block_size=1000, for_text=True, thre
hash_groups[j] = hash_groups[i]
return hash_groups

def unite_groups_by_hashes(self, messages, threshold=0.95):
def unite_groups_by_hashes(self, messages: list[str], threshold: float = 0.95) -> dict[int, list[int]]:
start_time = time()
hash_prints = self.calculate_hashes(messages)
has_no_empty = False
@@ -125,7 +128,7 @@ def similarity_groupping(self, hash_prints, block_size=1000, for_text=True, thre
logger.debug("Time for finding hash groups: %.2f s", time() - start_time)
return rearranged_groups

def perform_light_deduplication(self, messages):
def perform_light_deduplication(self, messages: list[str]) -> tuple[list[str], dict[int, list[int]]]:
text_messages_set = {}
messages_to_cluster = []
ids_with_duplicates = {}
@@ -142,11 +145,10 @@ def perform_light_deduplication(self, messages):
ids_with_duplicates[text_messages_set[text_message_normalized]].append(idx)
return messages_to_cluster, ids_with_duplicates

def find_clusters(self, messages, threshold=0.95):
def find_clusters(self, messages: list[str], threshold: float = 0.95) -> dict[int, list[int]]:
messages_to_cluster, ids_with_duplicates = self.perform_light_deduplication(messages)
hash_groups = self.unite_groups_by_hashes(messages_to_cluster, threshold=threshold)
groups = self.find_groups_by_similarity(
messages_to_cluster, hash_groups, threshold=threshold)
groups = self.find_groups_by_similarity(messages_to_cluster, hash_groups, threshold=threshold)
new_groups = {}
for cluster in groups:
new_log_ids = []
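For orientation (not part of the diff itself): the methods annotated above form a pipeline in which `find_clusters` first deduplicates normalized messages, then groups them by shared n-gram hash fingerprints, and finally refines those groups by similarity. A minimal usage sketch, assuming the class is importable from `app.commons.clusterizer` and its dependencies are installed; the sample messages are hypothetical:

```python
from app.commons.clusterizer import Clusterizer

# Hypothetical log messages; in the service these come from prepared log documents.
messages = [
    "Connection to db-host refused after 30 s",
    "Connection to db-host refused after 45 s",
    "NullPointerException at LoginService.authenticate",
]

clusterizer = Clusterizer()
# Per the signature above, the result maps a cluster id to the indices of the
# messages that fell into that cluster.
clusters = clusterizer.find_clusters(messages, threshold=0.95)
for cluster_id, message_ids in clusters.items():
    print(cluster_id, [messages[i] for i in message_ids])
```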
app/commons/log_merger.py: 22 changes (12 additions, 10 deletions)
@@ -13,6 +13,7 @@
# limitations under the License.

import copy
from typing import Any, Optional

from app.utils import text_processing

@@ -34,9 +35,11 @@ def __init__(self):
"paths", "message_params", "detected_message_without_params_extended",
"whole_message"]

def merge_big_and_small_logs(self, logs: list[dict], log_level_ids_to_add: dict, log_level_messages: dict,
log_level_ids_merged, logs_ids_in_merged_logs) -> tuple[list[dict], dict]:
"""Merge big message logs with small ones"""
def merge_big_and_small_logs(
self, logs: list[dict[str, Any]], log_level_ids_to_add: dict[int, list[int]],
log_level_messages: dict[str, dict[int, str]], log_level_ids_merged: dict[int, dict[str, Any]],
logs_ids_in_merged_logs: dict[int, list[int]]) -> tuple[list[dict[str, Any]], dict[str, list[int]]]:
"""Merge big message logs with small ones."""
new_logs = []
for log in logs:
if not log["_source"]["message"].strip():
@@ -50,9 +53,7 @@ def merge_big_and_small_logs(self, logs: list[dict], log_level_ids_to_add: dict,

log_ids_for_merged_logs = {}
for log_level in log_level_messages["message"]:

if not log_level_ids_to_add[log_level] and \
log_level_messages["message"][log_level].strip():
if not log_level_ids_to_add[log_level] and log_level_messages["message"][log_level].strip():
log = log_level_ids_merged[log_level]
merged_logs_id = str(log["_id"]) + "_m"
new_log = self.prepare_new_log(
@@ -73,8 +74,9 @@ def merge_big_and_small_logs(self, logs: list[dict], log_level_ids_to_add: dict,
new_logs.append(new_log)
return new_logs, log_ids_for_merged_logs

def decompose_logs_merged_and_without_duplicates(self, logs: list[dict]) -> tuple[list[dict], dict]:
"""Merge big logs with small ones without duplicates"""
def decompose_logs_merged_and_without_duplicates(
self, logs: list[dict[str, Any]]) -> tuple[list[dict[str, Any]], dict[str, list[int]]]:
"""Merge big logs with small ones without duplicates."""
log_level_messages = {}
for field in self.fields_to_merge:
log_level_messages[field] = {}
@@ -127,8 +129,8 @@ def decompose_logs_merged_and_without_duplicates(self, logs: list[dict]) -> tupl
return self.merge_big_and_small_logs(
logs, log_level_ids_to_add, log_level_messages, log_level_ids_merged, logs_ids_in_merged_logs)

def prepare_new_log(self, old_log: dict, new_id, is_merged: bool, merged_small_logs: str,
fields_to_clean: list[str] | None = None) -> dict:
def prepare_new_log(self, old_log: dict[str, Any], new_id, is_merged: bool, merged_small_logs: str,
fields_to_clean: Optional[list[str]] = None) -> dict[str, Any]:
"""Prepare updated log"""
merged_log = copy.deepcopy(old_log)
merged_log["_source"]["is_merged"] = is_merged
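As a reading aid rather than part of the change: `decompose_logs_merged_and_without_duplicates` keeps logs with substantial messages as they are, folds short or empty messages into one synthetic document per log level, and reports which original ids each synthetic `<id>_m` document absorbed. A simplified, standalone sketch of that idea follows; it is not the module's actual code, and the field names are reduced for brevity:

```python
from collections import defaultdict


def merge_small_logs(logs: list[dict], min_words: int = 2) -> tuple[list[dict], dict[str, list[int]]]:
    """Simplified illustration of the merging idea behind LogMerger."""
    big_logs = []
    small_by_level: dict[int, list[dict]] = defaultdict(list)
    for log in logs:
        if len(log["message"].split()) >= min_words:
            big_logs.append(log)
        else:
            small_by_level[log["log_level"]].append(log)

    merged_logs: list[dict] = []
    ids_in_merged: dict[str, list[int]] = {}
    for level, group in small_by_level.items():
        # Mirrors merged_logs_id = str(log["_id"]) + "_m" in the diff above.
        merged_id = f"{group[0]['id']}_m"
        merged_logs.append({
            "id": merged_id,
            "log_level": level,
            "merged_small_logs": " ".join(log["message"] for log in group),
        })
        ids_in_merged[merged_id] = [log["id"] for log in group]
    return big_logs + merged_logs, ids_in_merged


logs = [
    {"id": 1, "log_level": 40000, "message": "java.io.IOException: connection reset by peer"},
    {"id": 2, "log_level": 40000, "message": "retrying"},
    {"id": 3, "log_level": 40000, "message": "failed"},
]
merged, mapping = merge_small_logs(logs)
print(mapping)  # {'2_m': [2, 3]}
```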
app/commons/log_requests.py: 103 changes (54 additions, 49 deletions)
@@ -13,6 +13,7 @@
# limitations under the License.

from datetime import datetime
from typing import Any

from app.commons.model.launch_objects import Launch, TestItem, Log, TestItemInfo
from app.commons.log_merger import LogMerger
@@ -21,47 +22,50 @@
from app.utils.log_preparation import basic_prepare


def create_log_template() -> dict:
return {
"_id": "",
"_index": "",
"_source": {
"launch_id": "",
"launch_name": "",
"launch_number": 0,
"launch_start_time": "",
"test_item": "",
"test_item_name": "",
"unique_id": "",
"cluster_id": "",
"cluster_message": "",
"test_case_hash": 0,
"is_auto_analyzed": False,
"issue_type": "",
"log_time": "",
"log_level": 0,
'original_message': '',
"original_message_lines": 0,
"original_message_words_number": 0,
"message": "",
"is_merged": False,
"start_time": "",
"merged_small_logs": "",
"detected_message": "",
"detected_message_with_numbers": "",
"stacktrace": "",
"only_numbers": "",
"found_exceptions": "",
"whole_message": "",
"potential_status_codes": "",
"found_tests_and_methods": "",
"cluster_with_numbers": False
}
}


class LogRequests:

def __init__(self):
self.log_merger = LogMerger()

@staticmethod
def _create_log_template() -> dict:
return {
"_id": "",
"_index": "",
"_source": {
"launch_id": "",
"launch_name": "",
"launch_number": 0,
"launch_start_time": "",
"test_item": "",
"test_item_name": "",
"unique_id": "",
"cluster_id": "",
"cluster_message": "",
"test_case_hash": 0,
"is_auto_analyzed": False,
"issue_type": "",
"log_time": "",
"log_level": 0,
"original_message_lines": 0,
"original_message_words_number": 0,
"message": "",
"is_merged": False,
"start_time": "",
"merged_small_logs": "",
"detected_message": "",
"detected_message_with_numbers": "",
"stacktrace": "",
"only_numbers": "",
"found_exceptions": "",
"whole_message": "",
"potential_status_codes": "",
"found_tests_and_methods": "",
"cluster_with_numbers": False}}

@staticmethod
def transform_issue_type_into_lowercase(issue_type):
return issue_type[:2].lower() + issue_type[2:]
@@ -79,21 +83,20 @@ def _fill_launch_test_item_fields(log_template: dict, launch: Launch, test_item:
log_template["_source"]["test_case_hash"] = test_item.testCaseHash
log_template["_source"]["is_auto_analyzed"] = test_item.isAutoAnalyzed
log_template["_source"]["test_item_name"] = text_processing.preprocess_test_item_name(test_item.testItemName)
log_template["_source"]["issue_type"] = LogRequests.transform_issue_type_into_lowercase(
test_item.issueType)
log_template["_source"]["start_time"] = datetime(
*test_item.startTime[:6]).strftime("%Y-%m-%d %H:%M:%S")
log_template["_source"]["issue_type"] = LogRequests.transform_issue_type_into_lowercase(test_item.issueType)
log_template["_source"]["start_time"] = datetime(*test_item.startTime[:6]).strftime("%Y-%m-%d %H:%M:%S")
return log_template

@staticmethod
def _fill_log_fields(log_template: dict, log: Log, number_of_lines: int):
def _fill_log_fields(log_template: dict, log: Log, number_of_lines: int) -> dict[str, Any]:
prepared_log = PreparedLogMessage(log.message, number_of_lines)
log_template["_id"] = log.logId
log_template["_source"]["log_time"] = datetime(*log.logTime[:6]).strftime("%Y-%m-%d %H:%M:%S")
log_template["_source"]["cluster_id"] = str(log.clusterId)
log_template["_source"]["cluster_message"] = log.clusterMessage
log_template["_source"]["cluster_with_numbers"] = utils.extract_clustering_setting(log.clusterId)
log_template["_source"]["log_level"] = log.logLevel
log_template["_source"]['original_message'] = log.message
log_template["_source"]["original_message_lines"] = text_processing.calculate_line_number(
prepared_log.clean_message)
log_template["_source"]["original_message_words_number"] = len(
@@ -137,13 +140,13 @@ def _fill_log_fields(log_template: dict, log: Log, number_of_lines: int):

@staticmethod
def _prepare_log(launch: Launch, test_item: TestItem, log: Log, project: str) -> dict:
log_template = LogRequests._create_log_template()
log_template = create_log_template()
log_template = LogRequests._fill_launch_test_item_fields(log_template, launch, test_item, project)
log_template = LogRequests._fill_log_fields(log_template, log, launch.analyzerConfig.numberOfLogLines)
return log_template

@staticmethod
def _fill_test_item_info_fields(log_template: dict, test_item_info: TestItemInfo, project: str) -> dict:
def _fill_test_item_info_fields(log_template: dict, test_item_info: TestItemInfo, project: str) -> dict[str, Any]:
log_template["_index"] = project
log_template["_source"]["launch_id"] = test_item_info.launchId
log_template["_source"]["launch_name"] = test_item_info.launchName
@@ -160,7 +163,7 @@ def _fill_test_item_info_fields(log_template: dict, test_item_info: TestItemInfo

@staticmethod
def _prepare_log_for_suggests(test_item_info: TestItemInfo, log: Log, project: str) -> dict:
log_template = LogRequests._create_log_template()
log_template = create_log_template()
log_template = LogRequests._fill_test_item_info_fields(log_template, test_item_info, project)
log_template = LogRequests._fill_log_fields(
log_template, log, test_item_info.analyzerConfig.numberOfLogLines)
@@ -184,14 +187,15 @@ def prepare_log_words(launches: list[Launch]) -> tuple[dict[str, int], int]:
return log_words, project

@staticmethod
def prepare_log_clustering_light(launch: Launch, test_item: TestItem, log: Log, project: str):
log_template = LogRequests._create_log_template()
def prepare_log_clustering_light(launch: Launch, test_item: TestItem, log: Log, project: str) -> dict[str, Any]:
log_template = create_log_template()
log_template = LogRequests._fill_launch_test_item_fields(log_template, launch, test_item, project)
prepared_log = PreparedLogMessage(log.message, -1)
log_template["_id"] = log.logId
log_template["_source"]["cluster_id"] = str(log.clusterId)
log_template["_source"]["cluster_message"] = log.clusterMessage
log_template["_source"]["log_level"] = log.logLevel
log_template["_source"]['original_message'] = log.message
log_template["_source"]["original_message_lines"] = text_processing.calculate_line_number(
prepared_log.clean_message)
log_template["_source"]["original_message_words_number"] = len(
@@ -206,7 +210,8 @@ def prepare_log_clustering_light(launch: Launch, test_item: TestItem, log: Log,
+ prepared_log.stacktrace)
return log_template

def prepare_logs_for_clustering(self, launch: Launch, number_of_lines: int, clean_numbers: bool, project: str):
def prepare_logs_for_clustering(self, launch: Launch, number_of_lines: int, clean_numbers: bool,
project: str) -> tuple[list[str], dict[int, dict[str, Any]], dict[str, list[int]]]:
log_messages = []
log_dict = {}
ind = 0
Expand All @@ -219,8 +224,8 @@ def prepare_logs_for_clustering(self, launch: Launch, number_of_lines: int, clea
prepared_logs.append(LogRequests.prepare_log_clustering_light(launch, test_item, log, project))
merged_logs, log_ids_for_merged_logs = self.log_merger.decompose_logs_merged_and_without_duplicates(
prepared_logs)
for _id in log_ids_for_merged_logs:
full_log_ids_for_merged_logs[_id] = log_ids_for_merged_logs[_id]
for _id, merged_list in log_ids_for_merged_logs.items():
full_log_ids_for_merged_logs[_id] = merged_list
for log in merged_logs:
number_of_log_lines = number_of_lines
if log["_source"]["is_merged"]:
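Two practical effects of this file's changes, beyond the added type hints: the template builder is now a module-level `create_log_template()` instead of the static method `LogRequests._create_log_template()`, and prepared documents carry the raw log text in a new `_source["original_message"]` field. A hedged sketch of building a template directly, assuming the module path stays `app.commons.log_requests`; the index and id values are made up for illustration:

```python
from app.commons.log_requests import create_log_template

template = create_log_template()
template["_index"] = "project-1"   # hypothetical project index
template["_id"] = 12345            # hypothetical log id
template["_source"]["log_level"] = 40000
template["_source"]["message"] = "java.lang.NullPointerException at LoginService.authenticate"
# New in this commit: the unprocessed text is kept alongside the cleaned message,
# as _fill_log_fields and prepare_log_clustering_light do with log.message.
template["_source"]["original_message"] = template["_source"]["message"]

assert "original_message" in create_log_template()["_source"]
```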