Refactor analyze_logs to use new runtime line (#526)
bfhealy authored Jan 3, 2024
1 parent c6e510f commit 6eff864
Showing 1 changed file with 54 additions and 48 deletions.
tools/analyze_logs.py (102 changes: 54 additions & 48 deletions)
@@ -2,10 +2,11 @@
 import pathlib
 import argparse
 import pandas as pd
-import warnings
 from datetime import timedelta
 import json
 import matplotlib.pyplot as plt
+import os
+from datetime import datetime
 
 BASE_DIR = pathlib.Path(__file__).parent.parent.absolute()
 plt.rcParams["font.size"] = 16
@@ -18,18 +19,13 @@ def get_parser():
     parser.add_argument(
         "--logs-path",
         type=str,
+        default="generated_features_new/logs",
         help="path (from base_dir) to slurm logs",
     )
-    parser.add_argument(
-        "--job-ids-prefix",
-        type=str,
-        default="job_ids",
-        help="path (from base_dir) + prefix of file containing slurm job ids",
-    )
     parser.add_argument(
         "--logs-name-pattern",
         type=str,
-        default="",
+        default="generate_features_new",
         help="common naming convention for slurm logs (e.g. generate_features)",
     )
     parser.add_argument(
@@ -51,58 +47,66 @@ def get_parser():
         help="name of histogram plot (saved in base_dir)",
     )
     parser.add_argument(
-        "--workflow",
+        "--start-date",
         type=str,
-        default="feature_generation",
-        help="name of workflow",
+        default="2023-12-27",
+        help="Earliest date to include in log search [YYYY-MM-DD]",
     )
 
     return parser
 
 
 def main(
-    logs_path,
-    job_ids_prefix="job_ids",
-    logs_name_pattern="",
+    logs_path="generated_features_new/logs",
+    logs_name_pattern="generate_features_new",
     logs_suffix="out",
     output_prefix="runtime_output",
     plot_name="quad_runtime_hist",
-    workflow="feature_generation",
+    start_date="2023-12-27",
 ):
-    job_ids = pd.read_table(BASE_DIR / f"{job_ids_prefix}_{workflow}.txt", header=None)
 
     logs_path = BASE_DIR / logs_path
+    log_files = logs_path.glob(f"{logs_name_pattern}_[0-9]*_[0-9]*.{logs_suffix}")
+    log_files = [x for x in log_files]
 
-    results_dct = {}
-    for id_runtime_pair in job_ids.values:
-        job_id, runtime = id_runtime_pair[0].split(',')
-
-        log_path = logs_path.glob(f"{logs_name_pattern}_{job_id}_*.{logs_suffix}")
-        log_path = [x for x in log_path]
+    if len(log_files) == 0:
+        raise ValueError("Could not find any log files.")
 
-        if len(log_path) > 0:
-            log_output = pd.read_table(log_path[0], header=None)
+    start_date = datetime.strptime(start_date, '%Y-%m-%d')
 
-            n_sources_start = int(log_output.iloc[2].values[0].split()[1])
-            n_sources_end = int(log_output.iloc[-1].values[0].split()[3])
-
-            runtime_split = runtime.split("-")
-            if len(runtime_split) == 1:
-                runtime_days = 0
-                runtime_hms = runtime_split[0].split(":")
-            else:
-                runtime_days = int(runtime_split[0])
-                runtime_hms = runtime_split[1].split(":")
-
-            runtime_hours = int(runtime_hms[0])
-            runtime_minutes = int(runtime_hms[1])
-            runtime_seconds = int(runtime_hms[2])
+    results_dct = {}
+    log_count = 0
+    done_count = 0
+    for log_file in log_files:
+
+        mod_time = os.path.getmtime(log_file)
+        mod_datetime = datetime.utcfromtimestamp(mod_time)
+
+        if mod_datetime > start_date:
+            log_count += 1
+            job_id = str(log_file).split("_")[-2]
+
+            try:
+                log_output = pd.read_table(log_file, header=None)
+            except pd.errors.EmptyDataError:
+                # Some logs may be empty if the instance just began
+                continue
+
+            try:
+                n_sources_start = int(log_output.iloc[2].values[0].split()[1])
+            except IndexError:
+                # Some logs may not yet have initial results if instance just began
+                continue
+
+            try:
+                n_sources_end = int(log_output.iloc[-2].values[0].split()[3])
+                runtime = float(log_output.iloc[-1].values[0].split()[3])
+            except IndexError:
+                # Some logs may not yet have final results if the instance is still running
+                continue
 
             delta = timedelta(
-                days=runtime_days,
-                hours=runtime_hours,
-                minutes=runtime_minutes,
-                seconds=runtime_seconds,
+                seconds=runtime,
             )
             total_seconds = delta.total_seconds()
 
@@ -113,22 +117,24 @@ def main(
                 "seconds_per_source_start": total_seconds / n_sources_start,
             }
 
-        else:
-            warnings.warn(f"Could not find log for job ID {job_id}")
+            done_count += 1
 
+    print(f"Found {log_count} logs modified after {start_date}.")
     # make histogram
     sec_per_lc_start = [x['seconds_per_source_start'] for x in results_dct.values()]
 
     fig = plt.figure(figsize=(7, 7))
     plt.hist(sec_per_lc_start)
     plt.xlabel("Quadrant runtime [sec per lightcurve]")
     plt.ylabel("Count")
-    fig.savefig(BASE_DIR / f"{plot_name}_{workflow}.pdf", bbox_inches='tight')
-    print(f"Saved plot to {BASE_DIR}/{plot_name}_{workflow}.pdf")
+    fig.savefig(BASE_DIR / f"{plot_name}_{logs_name_pattern}.pdf", bbox_inches='tight')
+    print(f"Saved plot to {BASE_DIR}/{plot_name}_{logs_name_pattern}.pdf")
 
-    with open(BASE_DIR / f"{output_prefix}_{workflow}.json", "w") as f:
+    with open(BASE_DIR / f"{output_prefix}_{logs_name_pattern}.json", "w") as f:
         json.dump(results_dct, f)
-    print(f"Wrote results to {BASE_DIR}/{output_prefix}_{workflow}.json")
+    print(
+        f"Wrote results for {done_count} completed jobs to {BASE_DIR}/{output_prefix}_{logs_name_pattern}.json"
+    )
 
 
 if __name__ == "__main__":
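With this refactor, analyze_logs.py no longer reads each job's elapsed time from a comma-separated job-IDs file. It now globs slurm logs matching the --logs-name-pattern naming convention, keeps only those modified after --start-date, and pulls the runtime (in seconds) and final source count directly from the last lines of each log; output files are keyed by the logs name pattern instead of a workflow name. A minimal usage sketch of the new entry point with its defaults follows (the tools.analyze_logs import path is assumed for illustration; in practice the script is run from the command line through its argparse interface):

from tools.analyze_logs import main  # import path assumed; adjust to your checkout

# Scan BASE_DIR/generated_features_new/logs for generate_features_new_<jobid>_*.out
# files modified after 2023-12-27, then write the runtime histogram and JSON summary.
main(
    logs_path="generated_features_new/logs",
    logs_name_pattern="generate_features_new",
    logs_suffix="out",
    output_prefix="runtime_output",
    plot_name="quad_runtime_hist",
    start_date="2023-12-27",
)

With these defaults, the results land in BASE_DIR as quad_runtime_hist_generate_features_new.pdf and runtime_output_generate_features_new.json.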
