Skip to content
This repository has been archived by the owner on Apr 14, 2023. It is now read-only.

Commit

Permalink
Use Jekyll generated files as source for reports
Browse files Browse the repository at this point in the history
Relying on the data in the filesystem is only reliable when working with
a clean slate. However, deleting websites will leave old data behind, so
instead of checking the filesystem, use Jekyll generated files for their
actual purpose: being the authoritative source of sites & reports.

Fixes #57
  • Loading branch information
allejo committed Oct 21, 2017
1 parent 60d487d commit bd29fbd
Showing 1 changed file with 16 additions and 18 deletions.
34 changes: 16 additions & 18 deletions App_Data/jobs/triggered/aggregate/job.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,14 +133,20 @@ def sum_data_by_key_file(file_name, group_by, sum_key, keys_to_strip, sort_by):
# Reports that will not be aggregated by this script
ignored_reports = []

# Get all of our agencies and delete the first item in the list. The first item is a collection of everything in
# the folder and is safe to skip
agencies = [ agency for agency in os.walk(report_folder) ]
del agencies[0]
# Environment variables set during analytics fetching
try:
environment_vars = read_json_file(os.path.join(cwd, 'reports', 'env.json'))
except FileNotFoundError:
print("No environment variables have been defined.")
print("If you're in a dev environment, be sure to build the website first.")
exit()

# The keys of the environment variables contain all of the sites we're actively tracking
agencies = environment_vars.keys()

# Get all of the reports for the smgov website. We will go on the assumption that the 'smgov' website will have all
# of the reports
reports = next(filter(lambda x: x[0] == "data/smgov", agencies))
# Get all of the reports generated by the site
csm_reports = read_json_file(os.path.join(cwd, 'reports', 'csm.json'))
reports = ['{}.json'.format(r['name']) for r in csm_reports['reports']]

# With the aggregation, the sorting is lost, so sort these reports' `data` array by the respective key
sortBy = {
Expand All @@ -162,19 +168,11 @@ def sum_data_by_key_file(file_name, group_by, sum_key, keys_to_strip, sort_by):
'top-pages-30-days.json': ['domain']
}

# Environment variables set during analytics fetching
try:
environment_vars = read_json_file(os.path.join(cwd, 'reports', 'env.json'))
except FileNotFoundError:
print("No environment variables have been defined. If you're in a dev environment, be sure to build the website first")
exit()

#
# Aggregate all of the reports
#

# reports[2] is where all of the report file names are stored
for report in reports[2]:
for report in reports:
if not report.endswith('.json') or report in ignored_reports:
continue

Expand All @@ -183,8 +181,8 @@ def sum_data_by_key_file(file_name, group_by, sum_key, keys_to_strip, sort_by):

for agency in agencies:
# agency[0] is the path to the agency
report_file = os.path.join(agency[0], report)
agency_name = os.path.basename(agency[0])
report_file = os.path.join('data', agency, report)
agency_name = os.path.basename(agency)

try:
with open(report_file, 'r+', encoding='utf8') as file_content:
Expand Down

0 comments on commit bd29fbd

Please sign in to comment.