From e6ea3cc88d1654515519701b060a4c21098fcc8c Mon Sep 17 00:00:00 2001 From: TwistedTwigleg Date: Mon, 1 May 2023 13:55:16 -0400 Subject: [PATCH] Adjust the canary not to crash if it cannot poll the metric alarms (#279) Adjust the canary not to crash if it cannot poll the metric alarms. Improve exception printing. --- codebuild/CanaryWrapper.py | 2 +- codebuild/CanaryWrapper_24_7.py | 2 +- codebuild/CanaryWrapper_Classes.py | 82 ++++++++++++++-------- codebuild/CanaryWrapper_MetricFunctions.py | 6 +- 4 files changed, 57 insertions(+), 35 deletions(-) diff --git a/codebuild/CanaryWrapper.py b/codebuild/CanaryWrapper.py index c089ffd5..f3819137 100644 --- a/codebuild/CanaryWrapper.py +++ b/codebuild/CanaryWrapper.py @@ -298,7 +298,7 @@ def application_thread(): finished_email_body += "Failure due to unknown reason! This shouldn't happen and means something has gone wrong!" except Exception as e: print ("ERROR: Could not (possibly) cut ticket due to exception!") - print ("Exception: " + str(e), flush=True) + print (f"Exception: {repr(e)}", flush=True) # Clean everything up and stop snapshot_monitor.cleanup_monitor(error_occurred=wrapper_error_occurred) diff --git a/codebuild/CanaryWrapper_24_7.py b/codebuild/CanaryWrapper_24_7.py index 877b8259..034f85f4 100644 --- a/codebuild/CanaryWrapper_24_7.py +++ b/codebuild/CanaryWrapper_24_7.py @@ -369,7 +369,7 @@ def application_thread(): finished_email_body += "Failure due to unknown reason! This shouldn't happen and means something has gone wrong!" except Exception as e: print ("ERROR: Could not (possibly) cut ticket due to exception!") - print ("Exception: " + str(e), flush=True) + print (f"Exception: {repr(e)}", flush=True) # Clean everything up and stop snapshot_monitor.cleanup_monitor(error_occurred=wrapper_error_occurred) diff --git a/codebuild/CanaryWrapper_Classes.py b/codebuild/CanaryWrapper_Classes.py index 01f39062..75346a74 100644 --- a/codebuild/CanaryWrapper_Classes.py +++ b/codebuild/CanaryWrapper_Classes.py @@ -67,14 +67,14 @@ def add_metric_to_widget(self, new_metric_name): self.metric_list.append(new_metric_name) except Exception as e: print ("[DataSnapshot_Dashboard] ERROR - could not add metric to dashboard widget due to exception!") - print ("[DataSnapshot_Dashboard] Exception: " + str(e)) + print (f"[DataSnapshot_Dashboard] Exception: {repr(e)}") def remove_metric_from_widget(self, existing_metric_name): try: self.metric_list.remove(existing_metric_name) except Exception as e: print ("[DataSnapshot_Dashboard] ERROR - could not remove metric from dashboard widget due to exception!") - print ("[DataSnapshot_Dashboard] Exception: " + str(e)) + print (f"[DataSnapshot_Dashboard] Exception: {repr(e)}") def get_widget_dictionary(self): metric_list_json = [] @@ -170,6 +170,7 @@ def __init__(self, tmp_sts_client.get_caller_identity() except Exception as e: print ("[DataSnapshot] ERROR - AWS credentials are NOT valid!") + print (f"[DataSnapshot] ERROR - Exception: {repr(e)}") self.abort_due_to_internal_error = True self.abort_due_to_internal_error_reason = "AWS credentials are NOT valid!" self.abort_due_to_internal_error_due_to_credentials = True @@ -204,7 +205,7 @@ def __init__(self, self.cloudwatch_dashboard_name = self.git_metric_namespace except Exception as e: self.print_message("[DataSnapshot] ERROR - could not make Cloudwatch client due to exception!") - self.print_message("[DataSnapshot] Exception: " + str(e)) + self.print_message(f"[DataSnapshot] Exception: {repr(e)}") self.cloudwatch_client = None self.abort_due_to_internal_error = True self.abort_due_to_internal_error_reason = "Could not make Cloudwatch client!" @@ -217,7 +218,7 @@ def __init__(self, self.s3_client = boto3.client("s3") except Exception as e: self.print_message("[DataSnapshot] ERROR - could not make S3 client due to exception!") - self.print_message("[DataSnapshot] Exception: " + str(e)) + self.print_message(f"[DataSnapshot] Exception: {repr(e)}") self.s3_client = None self.abort_due_to_internal_error = True self.abort_due_to_internal_error_reason = "Could not make S3 client!" @@ -230,7 +231,7 @@ def __init__(self, self.lambda_client = boto3.client("lambda", self.cloudwatch_region) except Exception as e: self.print_message("[DataSnapshot] ERROR - could not make Lambda client due to exception!") - self.print_message("[DataSnapshot] Exception: " + str(e)) + self.print_message(f"[DataSnapshot] Exception: {repr(e)}") self.lambda_client = None self.abort_due_to_internal_error = True self.abort_due_to_internal_error_reason = "Could not make Lambda client!" @@ -274,7 +275,8 @@ def print_message(self, message): self.output_file.write(message + "\n") except Exception as ex: - print (f"[DataSnapshot] Exception trying to print to file: {ex}") + print (f"[DataSnapshot] ERROR - Exception trying to print to file") + print (f"[DataSnapshot] ERROR - Exception: {repr(ex)}") if (self.output_file is not None): self.output_file.close() self.output_file = None @@ -315,9 +317,10 @@ def _init_cloudwatch_pre_first_run_dashboard(self): DashboardBody= new_dashboard_body_json) self.print_message("[DataSnapshot] Added Cloudwatch dashboard successfully") except Exception as e: - self.print_message(f"[DataSnapshot] ERROR - Cloudwatch client could not make dashboard due to exception: {e}") + self.print_message(f"[DataSnapshot] ERROR - Cloudwatch client could not make dashboard due to exception") + self.print_message(f"[DataSnapshot] ERROR - Exception: {repr(e)}") self.abort_due_to_internal_error = True - self.abort_due_to_internal_error_reason = f"Cloudwatch client could not make dashboard due to exception {e}" + self.abort_due_to_internal_error_reason = f"Cloudwatch client could not make dashboard due to exception" return # Utility function - The function that adds each individual metric alarm. @@ -341,9 +344,10 @@ def _add_cloudwatch_metric_alarm(self, metric): ComparisonOperator="GreaterThanOrEqualToThreshold", ) except Exception as e: - self.print_message(f"[DataSnapshot] ERROR - could not register alarm for metric {metric.metric_name} due to exception: {e}") + self.print_message(f"[DataSnapshot] ERROR - could not register alarm for metric {metric.metric_name} due to exception") + self.print_message(f"[DataSnapshot] ERROR - Exception {repr(e)}") self.abort_due_to_internal_error = True - self.abort_due_to_internal_error_reason = f"Cloudwatch client could not make alarm due to exception: {e}" + self.abort_due_to_internal_error_reason = f"Cloudwatch client could not make alarm due to exception" # Utility function - removes all the Cloudwatch alarms for the metrics def _cleanup_cloudwatch_alarms(self): @@ -353,7 +357,8 @@ def _cleanup_cloudwatch_alarms(self): if (not metric.metric_alarm_threshold is None): self.cloudwatch_client.delete_alarms(AlarmNames=[metric.metric_alarm_name]) except Exception as e: - self.print_message(f"[DataSnapshot] ERROR - could not delete alarms due to exception: {e}") + self.print_message(f"[DataSnapshot] ERROR - could not delete alarms due to exception") + self.print_message(f"[DataSnapshot] ERROR - Exception {repr(e)}") # Utility function - removes all Cloudwatch dashboards created def _cleanup_cloudwatch_dashboard(self): @@ -362,7 +367,8 @@ def _cleanup_cloudwatch_dashboard(self): self.cloudwatch_client.delete_dashboards(DashboardNames=[self.cloudwatch_dashboard_name]) self.print_message("[DataSnapshot] Cloudwatch Dashboards deleted successfully!") except Exception as e: - self.print_message(f"[DataSnapshot] ERROR - dashboard cleaning function failed due to exception: {e}") + self.print_message(f"[DataSnapshot] ERROR - dashboard cleaning function failed due to exception") + self.print_message(f"[DataSnapshot] ERROR - Exception {repr(e)}") self.abort_due_to_internal_error = True self.abort_due_to_internal_error_reason = "Cloudwatch dashboard cleaning function failed due to exception" return @@ -408,7 +414,8 @@ def _check_cloudwatch_alarm_state_metric(self, metric): return return_result except Exception as e: - self.print_message(f"[DataSnapshot] ERROR - checking cloudwatch alarm failed due to exception: {e}") + self.print_message(f"[DataSnapshot] ERROR - checking cloudwatch alarm failed due to exception") + self.print_message(f"[DataSnapshot] ERROR - Exception {repr(e)}") return None # Exports a file with the same name as the commit Git hash to an S3 bucket in a folder with the Git repo name. @@ -461,7 +468,8 @@ def export_result_to_s3_bucket(self, copy_output_log=False, log_is_error=False): self.s3_client.upload_file(self.git_hash + ".log", self.s3_bucket_name, self.git_repo_name + "/Failed_Logs/" + self.datetime_string + "/" + self.git_hash + ".log") self.print_message("[DataSnapshot] Uploaded to S3!") except Exception as e: - self.print_message(f"[DataSnapshot] ERROR - could not upload to S3 due to exception: {e}") + self.print_message(f"[DataSnapshot] ERROR - could not upload to S3 due to exception") + self.print_message(f"[DataSnapshot] ERROR - Exception {repr(e)}") self.abort_due_to_internal_error = True self.abort_due_to_internal_error_reason = "S3 client had exception and therefore could not upload log!" os.remove(self.git_hash + ".log") @@ -485,7 +493,8 @@ def lambda_send_email(self, message, subject): Payload=payload_string ) except Exception as e: - self.print_message(f"[DataSnapshot] ERROR - could not send email via Lambda due to exception: {e}") + self.print_message(f"[DataSnapshot] ERROR - could not send email via Lambda due to exception") + self.print_message(f"[DataSnapshot] ERROR - Exception {repr(e)}") self.abort_due_to_internal_error = True self.abort_due_to_internal_error_reason = "Lambda email function had an exception!" return @@ -621,7 +630,8 @@ def export_metrics_cloudwatch(self): MetricData=metrics_data) self.print_message("[DataSnapshot] Metrics sent to Cloudwatch.") except Exception as e: - self.print_message(f"[DataSnapshot] Error - something when wrong posting cloudwatch metrics. Exception: {e}") + self.print_message(f"[DataSnapshot] Error - something when wrong posting cloudwatch metrics") + self.print_message(f"[DataSnapshot] ERROR - Exception {repr(e)}") self.print_message("[DataSnapshot] Not going to crash - just going to try again later") return @@ -741,7 +751,8 @@ def register_metric(self, new_metric_name, new_metric_function, new_metric_unit= new_metric_reports_to_skip=new_metric_reports_to_skip, new_metric_alarm_severity=new_metric_alarm_severity) except Exception as e: - self.print_message(f"[SnaptshotMonitor] ERROR - could not register metric in data snapshot due to exception: {e}") + self.print_message(f"[SnaptshotMonitor] ERROR - could not register metric in data snapshot due to exception") + self.print_message(f"[SnaptshotMonitor] ERROR - Exception {repr(e)}") self.had_internal_error = True self.internal_error_reason = "Could not register metric in data snapshot due to exception" return @@ -829,9 +840,8 @@ def monitor_loop_function(self, psutil_process : psutil.Process, time_passed=30) self.check_alarms_for_new_alarms(triggered_alarms) except Exception as e: self.print_message("[SnaptshotMonitor] ERROR - exception occurred checking metric alarms!") - self.print_message("[SnaptshotMonitor] (Likely session credentials expired)") - self.had_internal_error = True - self.internal_error_reason = "Exception occurred checking metric alarms! Likely session credentials expired" + self.print_message(f"[SnaptshotMonitor] ERROR - Exception {repr(e)}") + self.print_message("[SnaptshotMonitor] Not going to crash - just going to try again later") return if (self.metric_post_timer <= 0): @@ -840,8 +850,10 @@ def monitor_loop_function(self, psutil_process : psutil.Process, time_passed=30) self.data_snapshot.post_metrics(psutil_process) except Exception as e: self.print_message("[SnaptshotMonitor] ERROR - exception occurred posting metrics!") - self.print_message("[SnaptshotMonitor] (Likely session credentials expired)") + self.print_message(f"[SnaptshotMonitor] ERROR - Exception {repr(e)}") self.print_message("[SnaptshotMonitor] Not going to crash - just going to try again later") + # reset the timer + self.metric_post_timer += self.metric_post_timer_time return # reset the timer @@ -910,7 +922,7 @@ def start_monitoring(self): self.print_message ("[ApplicationMonitor] Application started...") except Exception as e: self.print_message ("[ApplicationMonitor] ERROR - Could not launch Canary/Application due to exception!") - self.print_message ("[ApplicationMonitor] Exception: " + str(e)) + self.print_message(f"[ApplicationMonitor] ERROR - Exception {repr(e)}") self.error_has_occurred = True self.error_reason = "Could not launch Canary/Application due to exception" self.error_code = 1 @@ -928,7 +940,8 @@ def restart_monitoring(self): self.print_message("\n[ApplicationMonitor] Restarted monitor application!") self.print_message("================================================================================") except Exception as e: - self.print_message(f"[ApplicationMonitor] ERROR - Could not restart Canary/Application due to exception: {e}") + self.print_message(f"[ApplicationMonitor] ERROR - Could not restart Canary/Application due to exception") + self.print_message(f"[ApplicationMonitor] ERROR - Exception {repr(e)}") self.error_has_occurred = True self.error_reason = "Could not restart Canary/Application due to exception" self.error_code = 1 @@ -961,7 +974,8 @@ def print_stdout(self): self.print_message(stdout_file.read()) os.remove(self.stdout_file_path) except Exception as e: - self.print_message(f"[ApplicationMonitor] ERROR - Could not print Canary/Application stdout to exception: {e}") + self.print_message(f"[ApplicationMonitor] ERROR - Could not print Canary/Application stdout to exception") + self.print_message(f"[ApplicationMonitor] ERROR - Exception {repr(e)}") def monitor_loop_function(self, time_passed=30): if (self.application_process != None): @@ -971,7 +985,7 @@ def monitor_loop_function(self, time_passed=30): application_process_return_code = self.application_process.poll() except Exception as e: self.print_message("[ApplicationMonitor] ERROR - exception occurred while trying to poll application status!") - self.print_message("[ApplicationMonitor] Exception: " + str(e)) + self.print_message(f"[ApplicationMonitor] ERROR - Exception {repr(e)}") self.error_has_occurred = True self.error_reason = "Exception when polling application status" self.error_code = 1 @@ -1048,6 +1062,7 @@ def __init__(self, s3_bucket_name, s3_file_name, s3_file_name_in_zip, canary_loc tmp_sts_client.get_caller_identity() except Exception as e: self.print_message("[S3Monitor] ERROR - (S3 Check) AWS credentials are NOT valid!") + self.print_message(f"[S3Monitor] ERROR - Exception {repr(e)}") self.had_internal_error = True self.error_due_to_credentials = True self.internal_error_reason = "AWS credentials are NOT valid!" @@ -1058,6 +1073,7 @@ def __init__(self, s3_bucket_name, s3_file_name, s3_file_name_in_zip, canary_loc self.s3_client = boto3.client("s3") except Exception as e: self.print_message("[S3Monitor] ERROR - (S3 Check) Could not make S3 client") + self.print_message(f"[S3Monitor] ERROR - Exception {repr(e)}") self.had_internal_error = True self.internal_error_reason = "Could not make S3 client for S3 Monitor" return @@ -1085,7 +1101,8 @@ def check_for_file_change(self): return except Exception as e: - self.print_message(f"[S3Monitor] ERROR - Could not check for new version of file in S3 due to exception: {e}") + self.print_message(f"[S3Monitor] ERROR - Could not check for new version of file in S3 due to exception") + self.print_message(f"[S3Monitor] ERROR - Exception {repr(e)}") self.print_message("[S3Monitor] Going to try again later - will not crash Canary") @@ -1096,6 +1113,7 @@ def replace_current_file_for_new_file(self): os.makedirs("tmp") except Exception as e: self.print_message ("[S3Monitor] ERROR - could not make tmp directory to place S3 file into!") + self.print_message(f"[S3Monitor] ERROR - Exception {repr(e)}") self.had_internal_error = True self.internal_error_reason = "Could not make TMP folder for S3 file download" return @@ -1108,6 +1126,7 @@ def replace_current_file_for_new_file(self): s3_resource.meta.client.download_file(self.s3_bucket_name, self.s3_file_name, new_file_path) except Exception as e: self.print_message("[S3Monitor] ERROR - could not download latest S3 file into TMP folder!") + self.print_message(f"[S3Monitor] ERROR - Exception {repr(e)}") self.had_internal_error = True self.internal_error_reason = "Could not download latest S3 file into TMP folder" return @@ -1132,7 +1151,7 @@ def replace_current_file_for_new_file(self): except Exception as e: self.print_message("[S3Monitor] ERROR - could not move file into local application path due to exception!") - self.print_message("[S3Monitor] Exception: " + str(e)) + self.print_message(f"[S3Monitor] ERROR - Exception {repr(e)}") self.had_internal_error = True self.internal_error_reason = "Could not move file into local application path" return @@ -1203,7 +1222,8 @@ def cut_ticket_using_cloudwatch( cloudwatch_client = boto3.client('cloudwatch', cloudwatch_region) ticket_alarm_name = git_repo_name + "-" + git_hash + "-AUTO-TICKET" except Exception as e: - print (f"ERROR - could not create Cloudwatch client to make ticket metric alarm due to exception: {e}", flush=True) + print (f"ERROR - could not create Cloudwatch client to make ticket metric alarm due to exception", flush=True) + print(f"ERROR - Exception {repr(e)}", flush=True) return new_metric_dimensions = [] @@ -1243,7 +1263,8 @@ def cut_ticket_using_cloudwatch( AlarmActions=[ticket_arn] ) except Exception as e: - print (f"ERROR - could not create ticket metric alarm due to exception: {e}", flush=True) + print (f"ERROR - could not create ticket metric alarm due to exception", flush=True) + print(f"ERROR - Exception {repr(e)}", flush=True) return # Trigger the alarm so it cuts the ticket @@ -1253,7 +1274,8 @@ def cut_ticket_using_cloudwatch( StateValue="ALARM", StateReason="AUTO TICKET CUT") except Exception as e: - print (f"ERROR - could not cut ticket due to exception: {e}", flush=True) + print (f"ERROR - could not cut ticket due to exception", flush=True) + print(f"ERROR - Exception {repr(e)}", flush=True) return print("Waiting for ticket metric to trigger...", flush=True) diff --git a/codebuild/CanaryWrapper_MetricFunctions.py b/codebuild/CanaryWrapper_MetricFunctions.py index 05b1934e..d352181a 100644 --- a/codebuild/CanaryWrapper_MetricFunctions.py +++ b/codebuild/CanaryWrapper_MetricFunctions.py @@ -20,7 +20,7 @@ def get_metric_total_cpu_usage(psutil_process : psutil.Process): return psutil.cpu_percent(interval=None) except Exception as e: print ("ERROR - exception occurred gathering metrics!") - print ("Exception: " + str(e), flush=True) + print (f"Exception: {repr(e)}", flush=True) return None # Note: This value is in BYTES. @@ -32,7 +32,7 @@ def get_metric_total_memory_usage_value(psutil_process : psutil.Process): return psutil.virtual_memory()[3] except Exception as e: print ("ERROR - exception occurred gathering metrics!") - print ("Exception: " + str(e), flush=True) + print (f"Exception: {repr(e)}", flush=True) return None @@ -44,5 +44,5 @@ def get_metric_total_memory_usage_percent(psutil_process : psutil.Process): return psutil.virtual_memory()[2] except Exception as e: print ("ERROR - exception occurred gathering metrics!") - print ("Exception: " + str(e), flush=True) + print (f"Exception: {repr(e)}", flush=True) return None