forked from selfhostedshow/show-notes
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape.py
145 lines (107 loc) · 4.25 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import concurrent.futures
import os
import html2text
import operator
import requests
from bs4 import BeautifulSoup
from jinja2 import Template
import yaml
from dateutil.parser import parse as date_parse
# Load the Jinja2 episode template once, at import time; create_episode()
# renders every episode page through it. The path is relative to the current
# working directory. Decode explicitly as UTF-8 so the template text does not
# depend on the platform's default locale encoding.
with open("templates/episode.md.j2", encoding="utf-8") as f:
    TEMPLATE = Template(f.read())
def mkdir_safe(directory):
    """Create ``directory`` if it does not exist yet.

    Missing parent directories are created as well, and calling this for a
    directory that already exists is a no-op, so it is safe to call
    repeatedly (including from several worker threads).

    Raises:
        FileExistsError: if ``directory`` exists but is not a directory.
    """
    os.makedirs(directory, exist_ok=True)
def get_list(soup, pre_title):
    """
    Find the `ul` that follows the `p` whose text is `pre_title`.

    Blocks of links are introduced by a `p` naming the block. Returns None
    when no such `p` exists; otherwise returns whatever
    `find_next_sibling("ul")` yields for that `p`.
    """
    heading = soup.find("p", string=pre_title)
    return heading.find_next_sibling("ul") if heading is not None else None
def get_duration(seconds):
    """
    Format a length given in seconds as "<minutes> mins <seconds> secs".
    """
    whole_minutes = seconds // 60
    leftover_seconds = seconds % 60
    return f"{whole_minutes} mins {leftover_seconds} secs"
def get_plain_title(title: str):
    """
    Reduce an episode title to the bare show title.

    Everything up to and including the first colon is dropped, everything
    from the last pipe onwards is dropped, and surrounding whitespace is
    trimmed. A title without a colon or without a pipe keeps that side intact.
    """
    # Drop the numbering that precedes the first colon, if there is one
    _, colon, after_colon = title.partition(":")
    if colon:
        title = after_colon

    # Drop whatever trails the last pipe, if there is one
    before_pipe, pipe, _ = title.rpartition("|")
    if pipe:
        title = before_pipe

    # Trim leftover whitespace
    return title.strip()
# Seconds to wait for the episode page before giving up, so a stalled
# connection cannot block a worker thread forever.
_PAGE_REQUEST_TIMEOUT = 30


def _section_markdown(soup, *titles):
    """Return the first link list found under any of ``titles`` as Markdown.

    Returns an empty string when none of the headings is present, instead of
    converting ``None`` into the literal text "None".
    """
    for title in titles:
        section = get_list(soup, title)
        if section is not None:
            return html2text.html2text(str(section))
    return ""


def create_episode(api_episode, show_config, output_dir):
    """Render one episode to ``<output_dir>/<year>/episode-<NNN>.md``.

    Does nothing (beyond creating the year directory) when the output file
    already exists, so re-running the scraper only fetches new episodes.

    Args:
        api_episode: One feed item. The keys read here are ``url``,
            ``date_published``, ``content_html``, ``summary``, ``title`` and
            ``attachments`` (the first attachment supplies ``url`` and
            ``duration_in_seconds``).
        show_config: Per-show configuration; ``fireside_url`` is used as the
            prefix for tag and host links, and the whole mapping is passed to
            the template.
        output_dir: Directory that receives the per-year sub-directories.

    Raises:
        requests.RequestException: if the episode page cannot be fetched
            (timeout, connection failure or HTTP error status).
        AttributeError: if the page has no ``copy-share-embed`` input.
    """
    base_url = show_config['fireside_url']

    # RANT: What kind of API doesn't give the episode number?!
    # Take it from the last path segment of the URL, tolerating a trailing "/".
    episode_number = int(api_episode["url"].rstrip("/").split("/")[-1])
    episode_number_padded = f"{episode_number:03}"

    publish_date = date_parse(api_episode['date_published'])

    output_file = f"{output_dir}/{publish_date.year}/episode-{episode_number_padded}.md"

    mkdir_safe(f"{output_dir}/{publish_date.year}")

    if os.path.isfile(output_file):
        print("Skipping", api_episode['url'], "as it already exists")
        return

    api_soup = BeautifulSoup(api_episode["content_html"], "html.parser")

    blurb = api_episode["summary"]

    # A missing section renders as an empty string, not as the text "None".
    sponsors = _section_markdown(api_soup, "Sponsored By:")
    links = _section_markdown(api_soup, "Links:", "Episode Links:")

    # Tags, hosts and the player embed are scraped from the episode page.
    response = requests.get(api_episode["url"], timeout=_PAGE_REQUEST_TIMEOUT)
    # Fail with a clear HTTP error instead of parsing an error page.
    response.raise_for_status()
    page_soup = BeautifulSoup(response.content, "html.parser")

    # Sort tags by text
    tags = sorted(
        (
            {"link": base_url + link.get("href"), "text": link.get_text().strip()}
            for link in page_soup.find_all("a", class_="tag")
        ),
        key=operator.itemgetter("text"),
    )

    hosts = []
    for host_list in page_soup.find_all("ul", class_="episode-hosts"):
        hosts.extend(
            {"name": link.get("title"), "link": base_url + link.get("href")}
            for link in host_list.find_all("a")
        )

    # The embed markup is stored in the value of the `copy-share-embed` input.
    player_embed = page_soup.find("input", class_="copy-share-embed").get("value")

    show_attachment = api_episode["attachments"][0]

    output = TEMPLATE.render(
        {
            "title": api_episode["title"],
            "title_plain": get_plain_title(api_episode["title"]),
            "episode_number": episode_number,
            "episode_number_padded": episode_number_padded,
            "url": api_episode["url"],
            "audio": show_attachment["url"],
            "duration": get_duration(int(show_attachment['duration_in_seconds'])),
            "blurb": blurb,
            "sponsors": sponsors,
            "links": links,
            "hosts": hosts,
            "tags": tags,
            "player_embed": player_embed,
            "date_published": publish_date.date().isoformat(),
            "show_config": show_config,
        }
    )

    # Write as UTF-8 explicitly: show notes routinely contain non-ASCII text
    # and the platform default encoding may not be able to represent it.
    with open(output_file, "w", encoding="utf-8") as f:
        print("Saving", api_episode["url"])
        f.write(output)
def main():
    """Scrape every configured show and write one Markdown file per episode.

    The shows are read from ``extra.shows`` in ``mkdocs.yml`` (relative to
    the current working directory). For each show the ``<fireside_url>/json``
    feed is downloaded and every item in it is handed to ``create_episode``
    on a thread pool.

    Raises:
        requests.RequestException: if a show feed cannot be fetched
            (timeout, connection failure or HTTP error status).
        Exception: whatever a ``create_episode`` worker raised; the first
            failure seen while draining the futures is re-raised.
    """
    # Grab the config embedded in the mkdocs config
    with open("mkdocs.yml", encoding="utf-8") as f:
        shows = yaml.load(f, Loader=yaml.SafeLoader)['extra']['shows']

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = []
        for show_slug, show_config in shows.items():
            output_dir = f"docs/{show_slug}"
            mkdir_safe(output_dir)

            # Bound the wait and surface HTTP errors instead of trying to
            # decode an error page as JSON.
            response = requests.get(show_config['fireside_url'] + "/json", timeout=30)
            response.raise_for_status()
            api_data = response.json()

            futures.extend(
                executor.submit(create_episode, api_episode, show_config, output_dir)
                for api_episode in api_data["items"]
            )

        # Drain to get exceptions. Still have to mash CTRL-C, though.
        for future in concurrent.futures.as_completed(futures):
            future.result()
# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()