update_meeting_database.py

# Scrape the DC ANC website's RSS feed for upcoming ANC meetings.
#
# Store them in ancbrigadesite/static/meetings.json, and update existing meetings
# in place according to the RSS feed's guid field.

import json, os.path, urllib.request, urllib.parse, urllib.error, re
import lxml.etree, lxml.html
from datetime import datetime

output_filename = "ancbrigadesite/static/meetings.json"

if os.path.exists(output_filename):
    meetings = json.load(open(output_filename))
else:
    meetings = []
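
# meetings.json is assumed to hold a flat JSON array of meeting objects, the
# same structure this script writes back out at the bottom. Illustratively
# (not real data):
#   [{"guid": "...", "title": "ANC 6B Monthly Meeting", "anc": "6B",
#     "when": "2014-01-09T19:00:00", "location": "...", ...}, ...]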

# Clear out old entries that don't conform to current data format.
#meetings = [m for m in meetings if "anc" in m]

# Open the RSS feed.
dom = lxml.etree.parse(urllib.request.urlopen("http://anc.dc.gov/node/all/events"))
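
# Each RSS <item> in the feed is assumed to look roughly like this
# (illustrative sketch, not verbatim feed output):
#
#   <item>
#     <title>ANC 6B Monthly Meeting</title>
#     <link>...</link>
#     <guid>...</guid>
#     <description>escaped HTML containing the div classes mapped below</description>
#   </item>
#
# The xpath() calls in the loop below pull these child elements out of each item.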

# In the description HTML, divs have nice class names identifying what
# the information is about. Map CSS classes to field names we'll store
# in our file.
class_name_map = {
    "field-name-field-building-name": "location2",
    "field-name-field-contact": "contact",
    #"field-name-field-date-time-rep": "date-time-rep", # duplicates 'field-type-datetime'
    "field-name-field-email": "email",
    "field-name-field-external-link-url": "link",
    "field-name-field-location": "location",
    "field-name-field-phone": "phone",
    "field-name-field-second-location": "alternate-location",
    "field-name-field-suite-number": "suite-number",
    "field-name-field-ward": "ward",
    "field-type-datetime": "when",
    "field-type-text": "text",
    #"field-type-text-long": "text2", # appears to be a broken website link
    "field-type-text-with-summary": "about_anc",
}

# The fields in the HTML have label divs and field value divs. Let's
# skip labels when they match a known value.
hide_field_labels = {
    "field-name-field-building-name": "Building",
    "field-name-field-email": "Email",
    "field-name-field-external-link-url": "Website",
    "field-name-field-location": "Location",
    "field-name-field-phone": "Phone",
    "field-name-field-suite-number": "Room",
    "field-name-field-ward": "Ward",
    "field-type-text": "Building",
    "field-type-text-with-summary": "Details",
}
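
# A field in the description HTML is assumed to look roughly like this
# (illustrative only; the actual markup may differ):
#
#   <div class="field field-name-field-location ...">
#     <div class="field-label">Location:</div>
#     <div class="field-items"><div class="field-item">1234 Example St NW</div></div>
#   </div>
#
# When the label matches the expected text above, only the .field-items
# content is kept so the stored value isn't prefixed with "Location:".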

# Take note of meetings already in our JSON file.
meetings_by_guid = { }
for meeting in meetings:
    meetings_by_guid[meeting["guid"]] = meeting

# Go through the RSS feed.
for meeting in dom.xpath("channel/item"):
    # Assemble meeting information.
    meeting_info = {
        "title": meeting.xpath("string(title)"),
        "calendar_link": meeting.xpath("string(link)"),
        "guid": meeting.xpath("string(guid)"),
    }

    # Replace existing meeting info if it's already in our file, or add it
    # if we don't know about it yet.
    if meeting_info["guid"] in meetings_by_guid:
        meetings_by_guid[meeting_info["guid"]].update(meeting_info)
        meeting_info = meetings_by_guid[meeting_info["guid"]]
    else:
        meetings.append(meeting_info)

    # The description tag has HTML content. Turn "&nbsp;" entity text into
    # actual non-breaking space characters, then parse it.
    descr = meeting.xpath("string(description)").replace("&nbsp;", "\u00A0")
    description = lxml.html.fromstring(descr)

    # Loop through the fields we know about and try to extract the values.
    for classname, fieldname in list(class_name_map.items()):
        # Should we include the field's label in the value we store?
        # Get the field value's div elements. There may be more than one.
        labels = description.cssselect("." + classname + " .field-label")
        if classname in hide_field_labels and len(labels) > 0 and labels[0].text_content().strip() == hide_field_labels[classname] + ":":
            divs = description.cssselect("." + classname + " .field-items")
        else:
            divs = description.cssselect("." + classname)
        if len(divs) > 0:
            value = "; ".join(d.text_content().strip() for d in divs)
            meeting_info[fieldname] = value
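
    # After this loop, meeting_info holds one key per field found in the
    # description, e.g. (illustrative values only):
    #   {"title": "ANC 6B Monthly Meeting", "guid": "...",
    #    "location": "1234 Example St NW",
    #    "when": "Thursday, January 9, 2014 - 7:00pm", ...}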

    # Scrape the title field to determine which ANC this is for and what kind of
    # meeting it is.
    m = re.match(r"ANC (\d[A-Z]) (Meeting|Monthly Meeting|Bimonthly Meetings)$", meeting_info["title"].strip())
    if not m:
        print("Unrecognized meeting title format:", meeting_info["title"])
    else:
        meeting_info["anc"] = m.group(1)
        meeting_info["meeting_type"] = m.group(2)
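
    # For example, a title of "ANC 6B Monthly Meeting" (illustrative) yields
    # anc = "6B" and meeting_type = "Monthly Meeting".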

    # Parse the meeting time into the ISO datetime format.
    # If there's "time X to time Y", chop off the second time.
    meeting_info["when"] = re.sub(r" to \d.*$", "", meeting_info["when"])
    # Strip the "Repeats every month..." recurrence text, keeping the weekday
    # that starts the actual date.
    meeting_info["when"] = re.sub(r"Repeats every month\s+on the \S+ (Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday) \d+ times\s*. (Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)", r"\1", meeting_info["when"])
    try:
        meeting_info["when"] = datetime.strptime(meeting_info["when"], "%A, %B %d, %Y - %I:%M%p").isoformat()
        if "when_unparsed" in meeting_info: del meeting_info["when_unparsed"]
    except ValueError:
        meeting_info["when_unparsed"] = meeting_info["when"]
        del meeting_info["when"]
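
    # Example (illustrative): "Thursday, January 9, 2014 - 7:00pm" parses to
    # "2014-01-09T19:00:00"; anything that doesn't match the expected format
    # is kept under "when_unparsed" instead.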

# Write out the JSON file.
with open(output_filename, "w") as outputfile:
    json.dump(meetings, outputfile, sort_keys=True, indent=4)

# Also write out a JSONP file to make embedding in the site easier.
with open(output_filename + "p", "w") as outputfile:
    outputfile.write("anc_meetings = \n")
    json.dump(meetings, outputfile, sort_keys=True, indent=4)
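
# The resulting meetings.jsonp assigns the array to a global `anc_meetings`
# variable, so a page could load it with a plain script tag, e.g.
# (illustrative; the site's actual templates may differ):
#   <script src="/static/meetings.jsonp"></script>
#   <script>console.log(anc_meetings.length);</script>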