From e22588deebd85560c896dbdbbd9473500c15415b Mon Sep 17 00:00:00 2001 From: showerst Date: Wed, 15 Jan 2025 23:31:48 -0500 Subject: [PATCH] MT: Scrape Votes, NE: Events fix (#5224) MT: Added a vote scraper NE: Skip event agendas that are blank --- scrapers/mt/bills.py | 127 +++++++++++++++++++++++++++++++++++++++--- scrapers/ne/events.py | 16 +++--- 2 files changed, 129 insertions(+), 14 deletions(-) diff --git a/scrapers/mt/bills.py b/scrapers/mt/bills.py index 794c470784..f28a680ca4 100644 --- a/scrapers/mt/bills.py +++ b/scrapers/mt/bills.py @@ -8,7 +8,7 @@ class MTBillScraper(Scraper): - TIMEZONE = pytz.timezone("America/Denver") + tz = pytz.timezone("America/Denver") results_per_page = 100 session_ord = None @@ -22,6 +22,10 @@ class MTBillScraper(Scraper): bill_types = {"B": "bill", "J": "joint resolution", "R": "resolution", "C": "bill"} + # legislator and action lookup tables for populating votes + legislators_by_id = {} + actions_by_id = {} + def scrape(self, session=None): for i in self.jurisdiction.legislative_sessions: @@ -72,6 +76,21 @@ def scrape_legislators(self): } ) + display_name = " ".join( + filter( + None, + [ + legislator["namePrefix"], + legislator["firstName"], + legislator["middleName"], + legislator["lastName"], + legislator["nameSuffix"], + ], + ) + ) + + self.legislators_by_id[str(legislator["id"])] = display_name + def scrape_requesting_agencies(self): self.requesting_agencies = [] url = "https://api.legmt.gov/legislators/v1/organizations" @@ -154,7 +173,6 @@ def scrape_list_page(self, session, page_num: int): # attempt to add a bill relation to the LC/draft version of this bill bill.add_related_bill(row["draft"]["draftNumber"], session, "replaces") - # TODO votes, used to be processed in actions self.scrape_actions(bill, row) self.scrape_extras(bill, row) self.scrape_subjects(bill, row) @@ -181,6 +199,8 @@ def scrape_list_page(self, session, page_num: int): ) yield bill + yield from self.scrape_votes(bill, str(row["id"])) + yield from 
def scrape_votes(self, bill: Bill, bill_id: str):
    """Yield the floor votes recorded for a bill.

    Args:
        bill: the Bill the yielded votes are attached to.
        bill_id: the API's id for the bill, as a string.

    Yields:
        Whatever ``scrape_votes_page`` yields for the floor-vote endpoint.
    """
    yield from self.scrape_votes_page(
        f"https://api.legmt.gov/bills/v1/votes/findByBillId?billId={bill_id}",
        bill,
    )


def scrape_committee_votes(self, bill: Bill, bill_id: str):
    """Yield the committee (executive action) votes recorded for a bill.

    Args:
        bill: the Bill the yielded votes are attached to.
        bill_id: the API's id for the bill, as a string.

    Yields:
        Whatever ``scrape_votes_page`` yields for the committee endpoint.
    """
    yield from self.scrape_votes_page(
        f"https://api.legmt.gov/committees/v1/executiveActions/findByBillId?billId={bill_id}",
        bill,
    )


# this scrapes both regular and committee votes, which have slightly different json
def scrape_votes_page(self, vote_url: str, bill: Bill):
    """Fetch one vote listing and yield a VoteEvent per recorded vote.

    Two row layouts are handled:

    * rows carrying ``billStatus``: the action id is read from
      ``billStatus.id``, the chamber from ``billStatus.billStatusCode.chamber``
      and the time from ``dateTime``.
    * rows carrying ``standingCommitteeMeeting``: the action id is read from
      ``billStatusId``, the chamber from
      ``standingCommitteeMeeting.standingCommittee.chamber`` and the time from
      ``voteTime``. Rows of this kind with no ``billStatusId`` or no
      ``legislatorVotes`` are logged and skipped.

    Each ballot's choice is read from ``voteType`` when present, otherwise
    from ``committeeVote``; the legislator id from ``legislatorId`` when
    present, otherwise from ``membership.legislatorId``.

    Relies on ``self.actions_by_id`` and ``self.legislators_by_id`` having
    been filled in beforehand; a missing id raises ``KeyError``.

    Args:
        vote_url: URL returning a JSON list of vote rows.
        bill: the Bill the votes belong to; its first source URL is reused
            as the source of every vote.

    Yields:
        One VoteEvent per row that is not skipped.

    Raises:
        NotImplementedError: a ballot has a choice other than YES / NO /
            ABSENT, or a row matches neither known layout.
    """
    try:
        page = self.get(vote_url).json()
    except scrapelib.HTTPError:
        # no data = 404 instead of empty json
        return

    for row in page:
        motion = row["motion"]

        # Tally once and remember each ballot's choice so the key lookup is
        # not repeated when the individual voters are attached below.
        counts = {"YES": 0, "NO": 0, "ABSENT": 0}
        ballots = []
        for v in row["legislatorVotes"]:
            vote_type = v["voteType"] if "voteType" in v else v["committeeVote"]
            if vote_type not in counts:
                # Fail with the logged record rather than a bare KeyError
                # from the tally dict.
                self.error(v)
                raise NotImplementedError
            counts[vote_type] += 1
            ballots.append((vote_type, v))

        passed = counts["YES"] > counts["NO"]

        # regular vs committee votes
        if "billStatus" in row:
            bill_action = self.actions_by_id[str(row["billStatus"]["id"])]
            chamber = (
                "lower"
                if row["billStatus"]["billStatusCode"]["chamber"] == "HOUSE"
                else "upper"
            )
            when = dateutil.parser.parse(row["dateTime"])
        elif "standingCommitteeMeeting" in row:
            if not row["billStatusId"] or not row["legislatorVotes"]:
                # voice vote: there is no per-legislator data, so skip it
                self.info(f"Skipping voice vote {row['id']}")
                continue

            chamber = (
                "lower"
                if row["standingCommitteeMeeting"]["standingCommittee"]["chamber"]
                == "HOUSE"
                else "upper"
            )
            bill_action = self.actions_by_id[str(row["billStatusId"])]
            when = dateutil.parser.parse(row["voteTime"])
        else:
            # Without this branch the action, chamber and time of the
            # previous row would be silently reused for this one.
            self.error(row)
            raise NotImplementedError

        when = self.tz.localize(when)
        vote_id = f"{bill.legislative_session}-{bill.identifier}-{str(row['id'])}"

        vote = VoteEvent(
            identifier=vote_id,
            start_date=when,
            motion_text=motion,
            bill_action=bill_action,
            result="pass" if passed else "fail",
            chamber=chamber,
            bill=bill,
            classification=[],
        )

        vote.set_count("yes", counts["YES"])
        vote.set_count("no", counts["NO"])
        # report the ABSENT tally here, not the NO tally
        vote.set_count("absent", counts["ABSENT"])
        vote.add_source(bill.sources[0]["url"])

        for vote_type, v in ballots:
            leg_id = (
                v["legislatorId"]
                if "legislatorId" in v
                else v["membership"]["legislatorId"]
            )
            voter = self.legislators_by_id[str(leg_id)]

            if vote_type == "YES":
                vote.yes(voter)
            elif vote_type == "NO":
                vote.no(voter)
            else:
                # only ABSENT can remain; other values were rejected above
                vote.vote("absent", voter)

        yield vote
a/scrapers/ne/events.py b/scrapers/ne/events.py index fa245110dd..0df55d7f1b 100644 --- a/scrapers/ne/events.py +++ b/scrapers/ne/events.py @@ -110,15 +110,17 @@ def scrape_events(self, page): desc = re_span.sub( "", lxml.etree.tostring(details.xpath("div")[2]).decode() ).strip() - agenda_item = event.add_agenda_item(description=desc) - if document not in ["Appointment"]: - bill_id = lxml.html.fromstring(document).text - agenda_item.add_bill(bill_id) + if desc: + agenda_item = event.add_agenda_item(description=desc) - bill_links = row.xpath(".//a[contains(@href, 'view_bill.php')]") - for link in bill_links: - agenda_item.add_bill(link.xpath("text()")[0].strip()) + if document not in ["Appointment"]: + bill_id = lxml.html.fromstring(document).text + agenda_item.add_bill(bill_id) + + bill_links = row.xpath(".//a[contains(@href, 'view_bill.php')]") + for link in bill_links: + agenda_item.add_bill(link.xpath("text()")[0].strip()) event.add_source("https://nebraskalegislature.gov/calendar/calendar.php") yield event