Skip to content

Commit

Permalink
Merge pull request #223 from kevinschaul/pictorial-api
Browse files Browse the repository at this point in the history
  • Loading branch information
hugovk authored Jan 3, 2025
2 parents ff693f6 + c570029 commit 5e7a9c1
Show file tree
Hide file tree
Showing 116 changed files with 147 additions and 53 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ var/
*.egg-info/
.installed.cfg
*.egg
.python-version

# PyInstaller
# Usually these files are written by a python script from a template
Expand Down Expand Up @@ -73,3 +74,5 @@ congress/metadata/A000000.yaml
# Thumbnail files
.DS_Store
thumbs.db

errors.json
Binary file added congress/225x275/A000380.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added congress/225x275/D000230.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added congress/225x275/D000530.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added congress/225x275/D000594.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added congress/225x275/D000633.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added congress/225x275/F000477.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added congress/225x275/F000479.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added congress/225x275/G000599.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added congress/225x275/G000600.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added congress/225x275/H001094.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added congress/225x275/H001096.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added congress/225x275/K000398.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added congress/225x275/K000399.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added congress/225x275/L000597.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added congress/225x275/L000600.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added congress/225x275/M001219.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added congress/225x275/M001221.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added congress/225x275/M001227.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added congress/225x275/M001228.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added congress/225x275/P000619.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added congress/225x275/R000617.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added congress/225x275/S001220.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added congress/225x275/S001221.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added congress/225x275/S001223.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added congress/225x275/T000487.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added congress/225x275/V000135.jpg
Binary file added congress/450x550/A000380.jpg
Binary file added congress/450x550/D000230.jpg
Binary file added congress/450x550/D000530.jpg
Binary file added congress/450x550/D000594.jpg
Binary file added congress/450x550/D000633.jpg
Binary file added congress/450x550/F000477.jpg
Binary file added congress/450x550/F000479.jpg
Binary file added congress/450x550/G000599.jpg
Binary file added congress/450x550/G000600.jpg
Binary file added congress/450x550/H001094.jpg
Binary file added congress/450x550/H001096.jpg
Binary file added congress/450x550/K000398.jpg
Binary file added congress/450x550/K000399.jpg
Binary file added congress/450x550/L000597.jpg
Binary file added congress/450x550/L000600.jpg
Binary file added congress/450x550/M001219.jpg
Binary file added congress/450x550/M001221.jpg
Binary file added congress/450x550/M001227.jpg
Binary file added congress/450x550/M001228.jpg
Binary file added congress/450x550/P000619.jpg
Binary file added congress/450x550/R000617.jpg
Binary file added congress/450x550/S001220.jpg
Binary file added congress/450x550/S001221.jpg
Binary file added congress/450x550/S001223.jpg
Binary file added congress/450x550/T000487.jpg
Binary file added congress/450x550/V000135.jpg
2 changes: 2 additions & 0 deletions congress/metadata/A000380.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
name: GPO Member Guide
link: https://pictorial.gpo.gov
2 changes: 2 additions & 0 deletions congress/metadata/B001319.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
name: GPO Member Guide
link: https://pictorial.gpo.gov
2 changes: 2 additions & 0 deletions congress/metadata/B001320.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
name: GPO Member Guide
link: https://pictorial.gpo.gov
2 changes: 2 additions & 0 deletions congress/metadata/D000230.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
name: GPO Member Guide
link: https://pictorial.gpo.gov
2 changes: 2 additions & 0 deletions congress/metadata/D000530.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
name: GPO Member Guide
link: https://pictorial.gpo.gov
2 changes: 2 additions & 0 deletions congress/metadata/D000594.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
name: GPO Member Guide
link: https://pictorial.gpo.gov
2 changes: 2 additions & 0 deletions congress/metadata/D000633.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
name: GPO Member Guide
link: https://pictorial.gpo.gov
2 changes: 2 additions & 0 deletions congress/metadata/F000477.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
name: GPO Member Guide
link: https://pictorial.gpo.gov
2 changes: 2 additions & 0 deletions congress/metadata/F000479.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
name: GPO Member Guide
link: https://pictorial.gpo.gov
2 changes: 2 additions & 0 deletions congress/metadata/G000599.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
name: GPO Member Guide
link: https://pictorial.gpo.gov
2 changes: 2 additions & 0 deletions congress/metadata/G000600.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
name: GPO Member Guide
link: https://pictorial.gpo.gov
2 changes: 2 additions & 0 deletions congress/metadata/H001094.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
name: GPO Member Guide
link: https://pictorial.gpo.gov
2 changes: 2 additions & 0 deletions congress/metadata/H001096.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
name: GPO Member Guide
link: https://pictorial.gpo.gov
2 changes: 2 additions & 0 deletions congress/metadata/J000307.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
name: GPO Member Guide
link: https://pictorial.gpo.gov
2 changes: 2 additions & 0 deletions congress/metadata/J000309.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
name: GPO Member Guide
link: https://pictorial.gpo.gov
2 changes: 2 additions & 0 deletions congress/metadata/K000398.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
name: GPO Member Guide
link: https://pictorial.gpo.gov
2 changes: 2 additions & 0 deletions congress/metadata/K000399.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
name: GPO Member Guide
link: https://pictorial.gpo.gov
2 changes: 2 additions & 0 deletions congress/metadata/L000597.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
name: GPO Member Guide
link: https://pictorial.gpo.gov
2 changes: 2 additions & 0 deletions congress/metadata/L000599.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
name: GPO Member Guide
link: https://pictorial.gpo.gov
2 changes: 2 additions & 0 deletions congress/metadata/L000600.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
name: GPO Member Guide
link: https://pictorial.gpo.gov
2 changes: 2 additions & 0 deletions congress/metadata/L000601.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
name: GPO Member Guide
link: https://pictorial.gpo.gov
2 changes: 2 additions & 0 deletions congress/metadata/L000602.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
name: GPO Member Guide
link: https://pictorial.gpo.gov
2 changes: 2 additions & 0 deletions congress/metadata/M001217.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
name: GPO Member Guide
link: https://pictorial.gpo.gov
2 changes: 2 additions & 0 deletions congress/metadata/M001219.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
name: GPO Member Guide
link: https://pictorial.gpo.gov
2 changes: 2 additions & 0 deletions congress/metadata/M001221.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
name: GPO Member Guide
link: https://pictorial.gpo.gov
2 changes: 2 additions & 0 deletions congress/metadata/M001222.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
name: GPO Member Guide
link: https://pictorial.gpo.gov
2 changes: 2 additions & 0 deletions congress/metadata/M001227.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
name: GPO Member Guide
link: https://pictorial.gpo.gov
2 changes: 2 additions & 0 deletions congress/metadata/M001228.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
name: GPO Member Guide
link: https://pictorial.gpo.gov
2 changes: 2 additions & 0 deletions congress/metadata/P000619.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
name: GPO Member Guide
link: https://pictorial.gpo.gov
2 changes: 2 additions & 0 deletions congress/metadata/R000617.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
name: GPO Member Guide
link: https://pictorial.gpo.gov
2 changes: 2 additions & 0 deletions congress/metadata/S001220.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
name: GPO Member Guide
link: https://pictorial.gpo.gov
2 changes: 2 additions & 0 deletions congress/metadata/S001221.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
name: GPO Member Guide
link: https://pictorial.gpo.gov
2 changes: 2 additions & 0 deletions congress/metadata/S001223.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
name: GPO Member Guide
link: https://pictorial.gpo.gov
2 changes: 2 additions & 0 deletions congress/metadata/T000487.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
name: GPO Member Guide
link: https://pictorial.gpo.gov
2 changes: 2 additions & 0 deletions congress/metadata/V000135.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
name: GPO Member Guide
link: https://pictorial.gpo.gov
2 changes: 2 additions & 0 deletions congress/metadata/V000137.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
name: GPO Member Guide
link: https://pictorial.gpo.gov
Binary file added congress/original/A000380.jpg
Binary file added congress/original/D000230.jpg
Binary file added congress/original/D000530.jpg
Binary file added congress/original/D000594.jpg
Binary file added congress/original/D000633.jpg
Binary file added congress/original/F000477.jpg
Binary file added congress/original/F000479.jpg
Binary file added congress/original/G000599.jpg
Binary file added congress/original/G000600.jpg
Binary file added congress/original/H001094.jpg
Binary file added congress/original/H001096.jpg
Binary file added congress/original/K000398.jpg
Binary file added congress/original/K000399.jpg
Binary file added congress/original/L000597.jpg
Binary file added congress/original/L000600.jpg
Binary file added congress/original/M001219.jpg
Binary file added congress/original/M001221.jpg
Binary file added congress/original/M001227.jpg
Binary file added congress/original/M001228.jpg
Binary file added congress/original/P000619.jpg
Binary file added congress/original/R000617.jpg
Binary file added congress/original/S001220.jpg
Binary file added congress/original/S001221.jpg
Binary file added congress/original/S001223.jpg
Binary file added congress/original/T000487.jpg
Binary file added congress/original/V000135.jpg
125 changes: 72 additions & 53 deletions scripts/gpo_member_photos.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,15 @@
import json
import os
import re
import sys
import time
from urllib.error import HTTPError
from urllib.parse import urlencode
from urllib.request import urlretrieve

# pip install -r requirements.txt
import mechanicalsoup

CURRENT_CONGRESS = 118

USER_AGENT = (
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
Expand Down Expand Up @@ -44,50 +44,38 @@ def pause(last, delay):
return datetime.datetime.now()


def get_photo_list(br, congress_number, delay):
    """
    Yield (bioguide_id, photo_file) pairs scraped from the Congress.gov
    Member listing for the given congress_number.

    Result pages (250 entries per request) are fetched one after another
    until a page no longer matches regex2, i.e. has no "Next" link.
    ``delay`` is handed to pause() before every request so that successive
    page fetches are throttled. Exits the program through sys.exit() when a
    page comes back empty.
    """
    last_request_time = None

    page = 1
    while True:
        # Throttle before each request. This call used to sit after an
        # if/else whose branches both left the iteration first, so it never
        # ran between page fetches.
        # NOTE(review): relies on pause() treating None as "no previous
        # request", which is what the first call has always passed - confirm.
        last_request_time = pause(last_request_time, delay)

        # Fetch a page of results from Congress.gov.
        print(f"Page {page} of Congress.gov Member listing...")
        query = urlencode(
            {
                "q": json.dumps(
                    {"source": "members", "congress": str(congress_number)}
                ),
                "pageSize": 250,
                "page": page,
            }
        )
        response = br.get("https://www.congress.gov/search?" + query).text

        if not response:
            sys.exit("Page is blank. Try again later, you may have hit a limit.")

        # Scan for links to Member pages and img tags. The link to the
        # Congress.gov page uses the Member's Bioguide ID as the key, and the
        # filename for the photo is the same file name found at
        # memberguide.gpo.gov for the high-resolution file.
        for bioguide_id, photo_file in regex1.findall(response):
            # this part is added by Congress.gov:
            photo_file = photo_file.replace("_200.jpg", ".jpg")
            if photo_file == bioguide_id.lower() + ".jpg":
                continue  # not a file sourced from GPO
            yield (bioguide_id, photo_file)

        if not regex2.search(response):
            # this was the last page (no Next link)
            break

        # fetch next page of results
        page += 1
def get_members_pictorial(br, congress_number):
    """
    Get members for the given congress_number
    API documentation: https://pictorialapi.gpo.gov/index.html

    Returns the entries of the response's "memberCollection" whose
    "memberType" is "Senator" or "Representative", leaving out the
    "Vacant, Vacant" placeholder entries. Raises if the server answers with
    an HTTP error status.
    """
    response = br.get(
        f"https://pictorialapi.gpo.gov/api/GuideMember/GetMembers/{congress_number}"
    )
    # Fail on an HTTP error status here instead of with a less helpful
    # JSON-decoding error or KeyError further down.
    response.raise_for_status()
    data = response.json()
    return [
        member
        for member in data["memberCollection"]
        if member["memberType"] in ("Senator", "Representative")
        and member["name"] != "Vacant, Vacant"
    ]


def get_legislators_current(br, include_historical=False):
    """
    Download legislators from sister project unitedstates/congress-legislators
    Optionally also include historical legislators (which significantly
    increases the download size)

    Returns one list: the current legislators, followed by the historical
    ones when include_historical is true. Raises if either download answers
    with an HTTP error status.
    """
    base_url = "https://theunitedstates.io/congress-legislators/"
    filenames = ["legislators-current.json"]
    if include_historical:
        filenames.append("legislators-historical.json")

    legislators = []
    for filename in filenames:
        response = br.get(base_url + filename)
        # Surface a failed download as an HTTP error rather than as a
        # JSON-decoding failure on an error page.
        response.raise_for_status()
        legislators += response.json()
    return legislators


def save_metadata(bioguide_id):
Expand All @@ -97,7 +85,7 @@ def save_metadata(bioguide_id):
outfile = os.path.join(outdir, bioguide_id + ".yaml")
with open(outfile, "w") as f:
f.write("name: GPO Member Guide\n")
f.write("link: https://memberguide.gpo.gov\n")
f.write("link: https://pictorial.gpo.gov\n")


def download_file(url, outfile):
Expand All @@ -120,8 +108,7 @@ def download_photos(br, photo_list, outdir, delay):

ok = 0

for bioguide_id, photo_filename in photo_list:
photo_url = "https://memberguide.gpo.gov/PictorialImages/" + photo_filename
for bioguide_id, photo_url in photo_list:
print(bioguide_id, photo_url)

filename = os.path.join(outdir, bioguide_id + ".jpg")
Expand All @@ -148,14 +135,15 @@ def resize_photos():

if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Scrape https://memberguide.gpo.gov and save "
"members' photos named after their Bioguide IDs",
description="Save members' photos from pictorialapi.gpo.gov, named "
"after their Bioguide IDs",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_argument(
"-n",
"--congress",
default="114",
default=CURRENT_CONGRESS,
type=int,
help="Congress session number, for example: 110, 111, 112, 113",
)
parser.add_argument(
Expand Down Expand Up @@ -183,11 +171,42 @@ def resize_photos():
br = mechanicalsoup.Browser()
br.set_user_agent(USER_AGENT)

photo_list = get_photo_list(br, args.congress, args.delay)
# Fetch both data sets: the congress-legislators records (historical ones
# too when a non-current congress was requested) and the GPO pictorial
# members for the requested congress.
legislators_current = get_legislators_current(br, args.congress != CURRENT_CONGRESS)
members_pictorial = get_members_pictorial(br, args.congress)

# Index the pictorial members by memberId once, instead of scanning the whole
# list again for every legislator. setdefault keeps the first entry for a
# given id, which is the one a front-to-back scan would have found.
members_by_id = {}
for p in members_pictorial:
    members_by_id.setdefault(p["memberId"], p)

photo_list = []
errors = []
for m in legislators_current:
    bioguide_id = m["id"]["bioguide"]

    # Only legislators carrying a pictorial id can be matched at all.
    pictorial_data = None
    if "pictorial" in m["id"]:
        pictorial_data = members_by_id.get(m["id"]["pictorial"])

    # An imageUrl containing "nophotoimage.jpg" is treated as no photo.
    if (
        pictorial_data is not None
        and "nophotoimage.jpg" not in pictorial_data["imageUrl"]
    ):
        photo_list.append((bioguide_id, pictorial_data["imageUrl"]))
    else:
        print(f"No photo available for {bioguide_id}")
        errors.append(["No photo available", bioguide_id, m["name"]])

number = download_photos(br, photo_list, args.outdir, args.delay)

if number:
    resize_photos()

# Record every legislator without a photo so the run can be reviewed later.
if errors:
    print(f"{len(errors)} legislators had errors. Details wrote to errors.json")
    with open("errors.json", "w") as f:
        json.dump(errors, f, indent=2)

# End of file

0 comments on commit 5e7a9c1

Please sign in to comment.