Skip to content

Commit

Permalink
MT: Scrape Votes, NE: Events fix (openstates#5224)
Browse files Browse the repository at this point in the history
MT: Added a vote scraper
NE: Skip event agendas that are blank
  • Loading branch information
showerst authored and Desitrain22 committed Jan 16, 2025
1 parent ea0f846 commit e22588d
Show file tree
Hide file tree
Showing 2 changed files with 129 additions and 14 deletions.
127 changes: 120 additions & 7 deletions scrapers/mt/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@


class MTBillScraper(Scraper):
TIMEZONE = pytz.timezone("America/Denver")
tz = pytz.timezone("America/Denver")
results_per_page = 100

session_ord = None
Expand All @@ -22,6 +22,10 @@ class MTBillScraper(Scraper):

bill_types = {"B": "bill", "J": "joint resolution", "R": "resolution", "C": "bill"}

# legislator and action lookup tables for populating votes
legislators_by_id = {}
actions_by_id = {}

def scrape(self, session=None):

for i in self.jurisdiction.legislative_sessions:
Expand Down Expand Up @@ -72,6 +76,21 @@ def scrape_legislators(self):
}
)

display_name = " ".join(
filter(
None,
[
legislator["namePrefix"],
legislator["firstName"],
legislator["middleName"],
legislator["lastName"],
legislator["nameSuffix"],
],
)
)

self.legislators_by_id[str(legislator["id"])] = display_name

def scrape_requesting_agencies(self):
self.requesting_agencies = []
url = "https://api.legmt.gov/legislators/v1/organizations"
Expand Down Expand Up @@ -154,7 +173,6 @@ def scrape_list_page(self, session, page_num: int):
# attempt to add a bill relation to the LC/draft version of this bill
bill.add_related_bill(row["draft"]["draftNumber"], session, "replaces")

# TODO votes, used to be processed in actions
self.scrape_actions(bill, row)
self.scrape_extras(bill, row)
self.scrape_subjects(bill, row)
Expand All @@ -181,6 +199,8 @@ def scrape_list_page(self, session, page_num: int):
)

yield bill
yield from self.scrape_votes(bill, str(row["id"]))
yield from self.scrape_committee_votes(bill, str(row["id"]))

if response["totalPages"] > page_num:
yield from self.scrape_list_page(session, page_num + 1)
Expand All @@ -189,7 +209,7 @@ def scrape_actions(self, bill: Bill, row: dict):
for action in row["draft"]["billStatuses"]:
name = action["billStatusCode"]["name"]
when = dateutil.parser.parse(action["timeStamp"])
when = self.TIMEZONE.localize(when)
when = self.tz.localize(when)
if "(H)" in name:
chamber = "lower"
elif "(S)" in name:
Expand All @@ -204,9 +224,8 @@ def scrape_actions(self, bill: Bill, row: dict):
classification=categorize_actions(name),
)

# TODO vote processing
# at this time, no new bills have votes yet
# so we have no idea how data will appear
# we want to be able to look up the action name later for votes
self.actions_by_id[str(action["id"])] = name

def scrape_extras(self, bill: Bill, row: dict):
bill.extras["bill_draft_number"] = row["draft"]["draftNumber"]
Expand Down Expand Up @@ -310,7 +329,7 @@ def scrape_archive_actions(self, bill: Bill, row: dict):
for action in row["billActions"]:
name = action["actionType"]["description"]
when = dateutil.parser.parse(action["date"])
when = self.TIMEZONE.localize(when)
when = self.tz.localize(when)
if "(H)" in name:
chamber = "lower"
elif "(S)" in name:
Expand Down Expand Up @@ -419,3 +438,97 @@ def scrape_lc_versions(self, bill: Bill, lc_number: str):
media_type="application/pdf",
on_duplicate="ignore",
)

def scrape_votes(self, bill: Bill, bill_id: str):
    """Yield floor-vote VoteEvents for *bill* from the bills votes endpoint."""
    endpoint = "https://api.legmt.gov/bills/v1/votes/findByBillId" f"?billId={bill_id}"
    yield from self.scrape_votes_page(endpoint, bill)

def scrape_committee_votes(self, bill: Bill, bill_id: str):
    """Yield committee (executive action) VoteEvents for *bill*."""
    endpoint = (
        "https://api.legmt.gov/committees/v1/executiveActions/findByBillId"
        f"?billId={bill_id}"
    )
    yield from self.scrape_votes_page(endpoint, bill)

# this scrapes both regular and committee votes, which have slightly different json
def scrape_votes_page(self, vote_url: str, bill: Bill):
    """Fetch one votes endpoint and yield a VoteEvent per roll call.

    Handles two JSON shapes: floor votes carry a "billStatus" object,
    committee executive-action votes carry a "standingCommitteeMeeting"
    object. The chamber, timestamp, and related-action id live in
    different places in each shape.
    """
    try:
        page = self.get(vote_url).json()
    except scrapelib.HTTPError:
        # no data = 404 instead of empty json
        return

    for row in page:
        motion = row["motion"]

        # tally the roll call; floor votes key each legislator's choice
        # as "voteType", committee votes as "committeeVote"
        counts = {"YES": 0, "NO": 0, "ABSENT": 0}
        for v in row["legislatorVotes"]:
            vote_type_key = "voteType" if "voteType" in v else "committeeVote"
            choice = v[vote_type_key]
            # .get() so an unexpected choice is still tallied here and
            # surfaces as NotImplementedError in the voter loop below,
            # instead of a bare KeyError with no context
            counts[choice] = counts.get(choice, 0) + 1

        passed = counts["YES"] > counts["NO"]

        # regular vs committee votes
        if "billStatus" in row:
            bill_action = self.actions_by_id[str(row["billStatus"]["id"])]
            chamber = (
                "lower"
                if row["billStatus"]["billStatusCode"]["chamber"] == "HOUSE"
                else "upper"
            )
            when = dateutil.parser.parse(row["dateTime"])
        elif "standingCommitteeMeeting" in row:
            if not row["billStatusId"] or not row["legislatorVotes"]:
                # voice vote, skip it there's no data
                self.info(f"Skipping voice vote {row['id']}")
                continue

            chamber = (
                "lower"
                if row["standingCommitteeMeeting"]["standingCommittee"]["chamber"]
                == "HOUSE"
                else "upper"
            )
            bill_action = self.actions_by_id[str(row["billStatusId"])]
            when = dateutil.parser.parse(row["voteTime"])
        else:
            # unknown row shape: without this guard the code below would
            # raise NameError on bill_action/chamber/when
            self.warning(f"Skipping vote row with unrecognized shape: {row.get('id')}")
            continue

        when = self.tz.localize(when)
        vote_id = f"{bill.legislative_session}-{bill.identifier}-{str(row['id'])}"

        vote = VoteEvent(
            identifier=vote_id,
            start_date=when,
            motion_text=motion,
            bill_action=bill_action,
            result="pass" if passed else "fail",
            chamber=chamber,
            bill=bill,
            classification=[],
        )

        vote.set_count("yes", counts["YES"])
        vote.set_count("no", counts["NO"])
        # BUG FIX: previously set_count("absent", counts["NO"]) — the NO
        # total was recorded as the absent count
        vote.set_count("absent", counts["ABSENT"])
        vote.add_source(bill.sources[0]["url"])

        for v in row["legislatorVotes"]:
            leg_id = (
                v["legislatorId"]
                if "legislatorId" in v
                else v["membership"]["legislatorId"]
            )
            voter = self.legislators_by_id[str(leg_id)]
            vote_type_key = "voteType" if "voteType" in v else "committeeVote"

            if v[vote_type_key] == "YES":
                vote.yes(voter)
            elif v[vote_type_key] == "NO":
                vote.no(voter)
            elif v[vote_type_key] == "ABSENT":
                vote.vote("absent", voter)
            else:
                # unrecognized vote type — log the raw record and fail loudly
                self.error(v)
                raise NotImplementedError

        yield vote
16 changes: 9 additions & 7 deletions scrapers/ne/events.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,15 +110,17 @@ def scrape_events(self, page):
desc = re_span.sub(
"", lxml.etree.tostring(details.xpath("div")[2]).decode()
).strip()
agenda_item = event.add_agenda_item(description=desc)

if document not in ["Appointment"]:
bill_id = lxml.html.fromstring(document).text
agenda_item.add_bill(bill_id)
if desc:
agenda_item = event.add_agenda_item(description=desc)

bill_links = row.xpath(".//a[contains(@href, 'view_bill.php')]")
for link in bill_links:
agenda_item.add_bill(link.xpath("text()")[0].strip())
if document not in ["Appointment"]:
bill_id = lxml.html.fromstring(document).text
agenda_item.add_bill(bill_id)

bill_links = row.xpath(".//a[contains(@href, 'view_bill.php')]")
for link in bill_links:
agenda_item.add_bill(link.xpath("text()")[0].strip())

event.add_source("https://nebraskalegislature.gov/calendar/calendar.php")
yield event

0 comments on commit e22588d

Please sign in to comment.