-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathucsb_scraper.py
140 lines (112 loc) · 4.79 KB
/
ucsb_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
# Scrapy spider for the list of Statements of Administration Policy (SAPs)
# published by the UCSB American Presidency Project
# (https://www.presidency.ucsb.edu/).
import re
import datetime
import scrapy
from bs4 import BeautifulSoup as bs4
import rtyaml
import dateutil.parser
# Timestamp (local time, ISO 8601) captured once at import and stamped on
# every scraped item as its "date_fetched" value.
now = datetime.datetime.now().isoformat()

# Maps a president's name, as it appears in the UCSB listing, to the base
# name of the archive file that collects that administration's SAPs.
Presidents = {
    # A value of None marks an administration that is skipped here because
    # its SAPs are (or were) pulled directly from authoritative sources.
    "Joseph R. Biden": None,
    "Donald J. Trump": None,
    "Barack Obama": None,

    # Administrations archived from UCSB: name -> output file base name.
    "George W. Bush": "43-Bush",
    "William J. Clinton": "42-Clinton",
    "George Bush": "41-Bush",
    "Ronald Reagan": "40-Reagan",
}
class UCSBSpider(scrapy.Spider):
    """Crawl the UCSB listing of Statements of Administration Policy.

    Starting from the listing page in ``start_urls``, each page's
    ``views-table`` is parsed row by row into one item per SAP, and the
    "Go to next page" link is followed until no further page is offered.
    Items are handed to the pipeline named in ``custom_settings``.
    """

    name = "sap-uscb"
    allowed_domains = ['www.presidency.ucsb.edu']
    start_urls = ['https://www.presidency.ucsb.edu/documents/presidential-documents-archive-guidebook/statements-administration-policy-reagan-1985']
    custom_settings = {
        'ITEM_PIPELINES': {
            'ucsb_scraper.UCSBPipeline': 300,
        },
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware': 543,
        },
        'HTTPCACHE_ENABLED': True,  # not sure if this is working
    }

    def parse(self, response):
        """Parse one listing page.

        Yields one dict per table row with the keys ``president``,
        ``bills``, ``document_title``, ``congress``, ``date_issued``,
        ``url``, ``date_fetched`` and ``source``, followed by a request
        for the next listing page when the page links to one.
        """
        soup = bs4(response.text, 'html.parser')

        table = soup.find('table', {'class': 'views-table'})
        if table is None:
            # Without the listing table there is nothing to scrape; report
            # it explicitly instead of failing with an AttributeError.
            self.logger.error("No 'views-table' table found at %s", response.url)
            return

        title_prefix = "Statement of Administration Policy: "

        for row in table.find_all('tr')[1:]:  # skip header
            # Fetch the cells once per row rather than once per column.
            cells = row.find_all('td')

            bill_numbers = self._parse_bill_numbers(cells[0].text.strip())
            president = cells[1].text.strip()
            date_issued = dateutil.parser.parse(cells[2].text).date().isoformat()

            title = cells[3].text.strip()
            if title.startswith(title_prefix):
                title = title[len(title_prefix):]

            link = row.find('a', href=True)['href']

            yield {
                # 'original_bills': cells[0].text.strip(),
                'president': president,
                'bills': bill_numbers,
                'document_title': title,
                'congress': self.get_congress_number(date_issued[:4]),
                'date_issued': date_issued,
                'url': link,
                'date_fetched': now,
                'source': self.start_urls[0],
            }

        # Scrape next page.
        a_next = soup.find('a', {'title': 'Go to next page'}, href=True)
        if a_next and "page=" in a_next["href"]:
            # scrapy.Request rejects URLs without a scheme, so resolve the
            # href against the current page; absolute hrefs are unchanged.
            yield scrapy.Request(response.urljoin(a_next["href"]), self.parse)

    @staticmethod
    def _parse_bill_numbers(text):
        """Turn the listing's free-text bill column into normalized ids.

        The text is split on ",", "&" and " and ". Each piece is
        lower-cased, stripped of whitespace, periods and parenthesized
        remarks, and has amendment / conference-report qualifiers removed.
        Empty pieces and pieces containing "_" are discarded. An empty
        input yields an empty list (no related bills recorded).
        """
        if not text:
            return []  # no info on related bill numbers

        normalized = [
            re.sub("-amend|senateamendmentto|conferencereportto|conferencereporton|confreporton|amendto", "",
                   re.sub(r"[\s\.]|\(.*?\)", "", part.lower()))
            for part in re.split(r"[,&]| and ", text)
        ]
        return [b for b in normalized if b.strip() != "" and "_" not in b]

    def get_congress_number(self, year):
        """Return the number of the Congress in session during ``year``.

        ``year`` may be an int or a numeric string. The result is anchored
        on 2022 falling in the 117th Congress; floor division maps each
        odd year and the even year after it to the same Congress.
        """
        # This is not quite right but the edge cases of SAPs
        # issued between Jan 1 and Jan 3 at noon of odd years,
        # which will be in the previous Congress, hopefully can be ignored.
        const_year = 2022
        const_congress = 117
        dif = const_year - int(year)
        return const_congress - (dif // 2)

    @classmethod
    def update_settings(cls, settings):
        """Apply this spider's ``custom_settings`` at 'spider' priority."""
        settings.setdict(cls.custom_settings or {}, priority='spider')
class UCSBPipeline:
    """Item pipeline that groups scraped SAPs by administration.

    Items are accumulated in memory while the spider runs and written out
    as one YAML file per administration when the spider closes.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline; nothing is read from the crawler."""
        return cls()

    def open_spider(self, spider):
        """Reset the in-memory store when a crawl starts."""
        # Collect scraped data here, organized by president.
        self.data = {}

    def process_item(self, item, spider):
        """File ``item`` under its administration and pass it on.

        The item's ``president`` key is looked up in ``Presidents`` and
        then removed from the item. Items whose president maps to a falsy
        value (``None``) are not stored. The item is returned so that the
        pipeline contract is honored and later stages receive it.

        Raises:
            KeyError: if the president's name is not a key of
                ``Presidents``.
        """
        president = Presidents[item['president']]
        del item['president']
        if president:
            self.data.setdefault(president, []).append(item)
        return item

    def close_spider(self, spider):
        """Write the collected items to ``archive/<admincode>.yaml``."""
        # Write all data to YAML files by president.
        for admincode, saps in self.data.items():
            fn = "archive/" + admincode + ".yaml"

            # # Restore the existing date_fetched so that on re-scans
            # # we don't cause a change throughout the data files.
            # import os.path
            # if os.path.exists(fn):
            #     date_fetched = { }
            #     with open(fn) as f:
            #         for item in rtyaml.load(f):
            #             date_fetched[item["url"]] = item["date_fetched"]
            #     for item in saps:
            #         item["date_fetched"] = date_fetched[item["url"]]

            # Fix the encoding so output does not depend on the platform's
            # default locale.
            with open(fn, "w", encoding="utf-8") as f:
                rtyaml.dump(saps, f)