Skip to content
This repository has been archived by the owner on Apr 14, 2023. It is now read-only.

Commit

Permalink
Use Jekyll generated files as source for reports
Browse files Browse the repository at this point in the history
Relying on the data in the filesystem is only reliable when working with
a clean slate. However, deleting websites will leave old data behind, so
instead of checking the filesystem, use Jekyll generated files for their
actual purpose: being the authoritative source of sites & reports.

Fixes #57
  • Loading branch information
allejo committed Oct 21, 2017
1 parent 60d487d commit bd29fbd
Showing 1 changed file with 16 additions and 18 deletions.
34 changes: 16 additions & 18 deletions App_Data/jobs/triggered/aggregate/job.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,14 +133,20 @@ def sum_data_by_key_file(file_name, group_by, sum_key, keys_to_strip, sort_by):
# Reports that will not be aggregated by this script
ignored_reports = []

# Get all of our agencies and delete the first item in the list. The first item is a collection of everything in
# the folder and is safe to skip
agencies = [ agency for agency in os.walk(report_folder) ]
del agencies[0]
# Environment variables set during analytics fetching
try:
environment_vars = read_json_file(os.path.join(cwd, 'reports', 'env.json'))
except FileNotFoundError:
print("No environment variables have been defined.")
print("If you're in a dev environment, be sure to build the website first.")
exit()

# The keys of the environment variables contain all of the sites we're actively tracking
agencies = environment_vars.keys()

# Get all of the reports for the smgov website. We will go on the assumption that the 'smgov' website will have all
# of the reports
reports = next(filter(lambda x: x[0] == "data/smgov", agencies))
# Get all of the reports generated by the site
csm_reports = read_json_file(os.path.join(cwd, 'reports', 'csm.json'))
reports = ['{}.json'.format(r['name']) for r in csm_reports['reports']]

# With the aggregation, the sorting is lost, so sort these reports' `data` array by the respective key
sortBy = {
Expand All @@ -162,19 +168,11 @@ def sum_data_by_key_file(file_name, group_by, sum_key, keys_to_strip, sort_by):
'top-pages-30-days.json': ['domain']
}

# Environment variables set during analytics fetching
try:
environment_vars = read_json_file(os.path.join(cwd, 'reports', 'env.json'))
except FileNotFoundError:
print("No environment variables have been defined. If you're in a dev environment, be sure to build the website first")
exit()

#
# Aggregate all of the reports
#

# reports[2] is where all of the report file names are stored
for report in reports[2]:
for report in reports:
if not report.endswith('.json') or report in ignored_reports:
continue

Expand All @@ -183,8 +181,8 @@ def sum_data_by_key_file(file_name, group_by, sum_key, keys_to_strip, sort_by):

for agency in agencies:
# agency[0] is the path to the agency
report_file = os.path.join(agency[0], report)
agency_name = os.path.basename(agency[0])
report_file = os.path.join('data', agency, report)
agency_name = os.path.basename(agency)

try:
with open(report_file, 'r+', encoding='utf8') as file_content:
Expand Down

0 comments on commit bd29fbd

Please sign in to comment.