Skip to content

Commit

Permalink
MT: Scrape Votes, NE: Events fix (openstates#5224)
Browse files Browse the repository at this point in the history
MT: Added a vote scraper
NE: Skip event agendas that are blank
  • Loading branch information
showerst authored and Desitrain22 committed Jan 16, 2025
1 parent ea0f846 commit e22588d
Show file tree
Hide file tree
Showing 2 changed files with 129 additions and 14 deletions.
127 changes: 120 additions & 7 deletions scrapers/mt/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@


class MTBillScraper(Scraper):
TIMEZONE = pytz.timezone("America/Denver")
tz = pytz.timezone("America/Denver")
results_per_page = 100

session_ord = None
Expand All @@ -22,6 +22,10 @@ class MTBillScraper(Scraper):

bill_types = {"B": "bill", "J": "joint resolution", "R": "resolution", "C": "bill"}

# legislator and action lookup tables for populating votes
legislators_by_id = {}
actions_by_id = {}

def scrape(self, session=None):

for i in self.jurisdiction.legislative_sessions:
Expand Down Expand Up @@ -72,6 +76,21 @@ def scrape_legislators(self):
}
)

display_name = " ".join(
filter(
None,
[
legislator["namePrefix"],
legislator["firstName"],
legislator["middleName"],
legislator["lastName"],
legislator["nameSuffix"],
],
)
)

self.legislators_by_id[str(legislator["id"])] = display_name

def scrape_requesting_agencies(self):
self.requesting_agencies = []
url = "https://api.legmt.gov/legislators/v1/organizations"
Expand Down Expand Up @@ -154,7 +173,6 @@ def scrape_list_page(self, session, page_num: int):
# attempt to add a bill relation to the LC/draft version of this bill
bill.add_related_bill(row["draft"]["draftNumber"], session, "replaces")

# TODO votes, used to be processed in actions
self.scrape_actions(bill, row)
self.scrape_extras(bill, row)
self.scrape_subjects(bill, row)
Expand All @@ -181,6 +199,8 @@ def scrape_list_page(self, session, page_num: int):
)

yield bill
yield from self.scrape_votes(bill, str(row["id"]))
yield from self.scrape_committee_votes(bill, str(row["id"]))

if response["totalPages"] > page_num:
yield from self.scrape_list_page(session, page_num + 1)
Expand All @@ -189,7 +209,7 @@ def scrape_actions(self, bill: Bill, row: dict):
for action in row["draft"]["billStatuses"]:
name = action["billStatusCode"]["name"]
when = dateutil.parser.parse(action["timeStamp"])
when = self.TIMEZONE.localize(when)
when = self.tz.localize(when)
if "(H)" in name:
chamber = "lower"
elif "(S)" in name:
Expand All @@ -204,9 +224,8 @@ def scrape_actions(self, bill: Bill, row: dict):
classification=categorize_actions(name),
)

# TODO vote processing
# at this time, no new bills have votes yet
# so we have no idea how data will appear
# we want to be able to look up the action name later for votes
self.actions_by_id[str(action["id"])] = name

def scrape_extras(self, bill: Bill, row: dict):
bill.extras["bill_draft_number"] = row["draft"]["draftNumber"]
Expand Down Expand Up @@ -310,7 +329,7 @@ def scrape_archive_actions(self, bill: Bill, row: dict):
for action in row["billActions"]:
name = action["actionType"]["description"]
when = dateutil.parser.parse(action["date"])
when = self.TIMEZONE.localize(when)
when = self.tz.localize(when)
if "(H)" in name:
chamber = "lower"
elif "(S)" in name:
Expand Down Expand Up @@ -419,3 +438,97 @@ def scrape_lc_versions(self, bill: Bill, lc_number: str):
media_type="application/pdf",
on_duplicate="ignore",
)

def scrape_votes(self, bill: Bill, bill_id: str):
    """Yield floor-vote VoteEvents for *bill* from the bills votes endpoint."""
    endpoint = "https://api.legmt.gov/bills/v1/votes/findByBillId" f"?billId={bill_id}"
    yield from self.scrape_votes_page(endpoint, bill)

def scrape_committee_votes(self, bill: Bill, bill_id: str):
    """Yield committee (executive action) VoteEvents for *bill*."""
    endpoint = (
        "https://api.legmt.gov/committees/v1/executiveActions/findByBillId"
        f"?billId={bill_id}"
    )
    yield from self.scrape_votes_page(endpoint, bill)

# this scrapes both regular and committee votes, which have slightly different json
def scrape_votes_page(self, vote_url: str, bill: Bill):
    """Fetch one votes endpoint and yield a VoteEvent per roll call.

    Handles two JSON shapes: floor votes carry a "billStatus" object,
    committee executive-action votes carry a "standingCommitteeMeeting"
    object. The chamber, timestamp, and related-action id live in
    different places in each shape.
    """
    try:
        page = self.get(vote_url).json()
    except scrapelib.HTTPError:
        # no data = 404 instead of empty json
        return

    for row in page:
        motion = row["motion"]

        # tally the roll call; floor votes key each legislator's choice
        # as "voteType", committee votes as "committeeVote"
        counts = {"YES": 0, "NO": 0, "ABSENT": 0}
        for v in row["legislatorVotes"]:
            vote_type_key = "voteType" if "voteType" in v else "committeeVote"
            choice = v[vote_type_key]
            # .get() so an unexpected choice is still tallied here and
            # surfaces as NotImplementedError in the voter loop below,
            # instead of a bare KeyError with no context
            counts[choice] = counts.get(choice, 0) + 1

        passed = counts["YES"] > counts["NO"]

        # regular vs committee votes
        if "billStatus" in row:
            bill_action = self.actions_by_id[str(row["billStatus"]["id"])]
            chamber = (
                "lower"
                if row["billStatus"]["billStatusCode"]["chamber"] == "HOUSE"
                else "upper"
            )
            when = dateutil.parser.parse(row["dateTime"])
        elif "standingCommitteeMeeting" in row:
            if not row["billStatusId"] or not row["legislatorVotes"]:
                # voice vote, skip it there's no data
                self.info(f"Skipping voice vote {row['id']}")
                continue

            chamber = (
                "lower"
                if row["standingCommitteeMeeting"]["standingCommittee"]["chamber"]
                == "HOUSE"
                else "upper"
            )
            bill_action = self.actions_by_id[str(row["billStatusId"])]
            when = dateutil.parser.parse(row["voteTime"])
        else:
            # unknown row shape: without this guard the code below would
            # raise NameError on bill_action/chamber/when
            self.warning(f"Skipping vote row with unrecognized shape: {row.get('id')}")
            continue

        when = self.tz.localize(when)
        vote_id = f"{bill.legislative_session}-{bill.identifier}-{str(row['id'])}"

        vote = VoteEvent(
            identifier=vote_id,
            start_date=when,
            motion_text=motion,
            bill_action=bill_action,
            result="pass" if passed else "fail",
            chamber=chamber,
            bill=bill,
            classification=[],
        )

        vote.set_count("yes", counts["YES"])
        vote.set_count("no", counts["NO"])
        # BUG FIX: previously set_count("absent", counts["NO"]) — the NO
        # total was recorded as the absent count
        vote.set_count("absent", counts["ABSENT"])
        vote.add_source(bill.sources[0]["url"])

        for v in row["legislatorVotes"]:
            leg_id = (
                v["legislatorId"]
                if "legislatorId" in v
                else v["membership"]["legislatorId"]
            )
            voter = self.legislators_by_id[str(leg_id)]
            vote_type_key = "voteType" if "voteType" in v else "committeeVote"

            if v[vote_type_key] == "YES":
                vote.yes(voter)
            elif v[vote_type_key] == "NO":
                vote.no(voter)
            elif v[vote_type_key] == "ABSENT":
                vote.vote("absent", voter)
            else:
                # unrecognized vote type — log the raw record and fail loudly
                self.error(v)
                raise NotImplementedError

        yield vote
16 changes: 9 additions & 7 deletions scrapers/ne/events.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,15 +110,17 @@ def scrape_events(self, page):
desc = re_span.sub(
"", lxml.etree.tostring(details.xpath("div")[2]).decode()
).strip()
agenda_item = event.add_agenda_item(description=desc)

if document not in ["Appointment"]:
bill_id = lxml.html.fromstring(document).text
agenda_item.add_bill(bill_id)
if desc:
agenda_item = event.add_agenda_item(description=desc)

bill_links = row.xpath(".//a[contains(@href, 'view_bill.php')]")
for link in bill_links:
agenda_item.add_bill(link.xpath("text()")[0].strip())
if document not in ["Appointment"]:
bill_id = lxml.html.fromstring(document).text
agenda_item.add_bill(bill_id)

bill_links = row.xpath(".//a[contains(@href, 'view_bill.php')]")
for link in bill_links:
agenda_item.add_bill(link.xpath("text()")[0].strip())

event.add_source("https://nebraskalegislature.gov/calendar/calendar.php")
yield event

0 comments on commit e22588d

Please sign in to comment.