Skip to content

Commit

Permalink
PR: fix duplicate bill import issue
Browse files — browse the repository at this point in the history
  • Loading branch information
jessemortenson committed Jan 13, 2025
1 parent d1b9d2e commit 13c8e12
Showing 1 changed file with 11 additions and 1 deletion.
12 changes: 11 additions & 1 deletion scrapers/pr/bills.py
Original file line number | Diff line number | Diff line change
@@ -74,6 +74,7 @@ def clean_name(self, name):
def scrape(self, session=None, chamber=None, page=None):
self.seen_votes = set()
self.seen_bills = set()
self.seen_bill_identifiers = set()
chambers = [chamber] if chamber is not None else ["upper", "lower"]
for chamber in chambers:
yield from self.scrape_search_results(
@@ -339,7 +340,16 @@ def scrape_bill(self, chamber, session, url):
if title:
bill_id = re.findall(r"[A-Z]{2}\d{4}", title)[0]
else:
bill_id = ""
self.logger.error(f"Bill found with no bill identifier at {url}")

# PR occasionally repeats a bill at different URLs (????)
# example:
# PC0205 https://sutra.oslpr.org/medidas/152982
# PC0205 https://sutra.oslpr.org/medidas/152909
if bill_id in self.seen_bill_identifiers:
return
else:
self.seen_bill_identifiers.add(bill_id)

bill_type = self.classify_bill_type(bill_id)

Expand Down

0 comments on commit 13c8e12

Please sign in to comment.