-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape_mars.py
127 lines (104 loc) · 4.17 KB
/
scrape_mars.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#!/usr/bin/env python
# coding: utf-8
# Mission to Mars - Web Scraping
# Submitted by : Sheetal Bongale | UT Data Analysis and Visualization | March 3, 2020
######################################################################################
import pandas as pd
import pprint
import requests
import urllib.parse
from bs4 import BeautifulSoup as bs
import re
from selenium import webdriver
from splinter import Browser
def scrape():
    """Scrape Mars news, featured image, weather, facts and hemisphere images.

    Each source is fetched by a dedicated private helper; this function only
    defines the source URLs and assembles the results.

    Returns:
        dict: the scraped values under the keys ``news``, ``teaser``,
        ``featured_image_url``, ``featured_image_title``, ``weather``,
        ``facts`` (an HTML table string) and ``hemispheres`` (a list of
        ``{"title": ..., "hem_img_url": ...}`` dicts), plus the source URLs
        under ``news_url``, ``jpl_url``, ``weather_url``, ``fact_url`` and
        ``hemisphere_url``.

    Note:
        Element lookups are not guarded: if a page no longer contains the
        expected markup the resulting exception (for example
        ``AttributeError`` or ``IndexError``) propagates to the caller.
    """
    # URLs to be scraped:
    NEWS_URL = "https://mars.nasa.gov/news/"
    IMAGE_URL = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    WEATHER_URL = "https://twitter.com/marswxreport?lang=en"
    FACTS_URL = "http://space-facts.com/mars/"
    HEM_URL = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"

    news_title, teaser = _scrape_news(NEWS_URL)
    featured_image_url, featured_image_title = _scrape_featured_image(IMAGE_URL)
    mars_weather = _scrape_weather(WEATHER_URL)
    mars_fact_table_html = _scrape_facts(FACTS_URL)
    hemisphere_dict = _scrape_hemispheres(HEM_URL)

    # Store all the scraped data in a dictionary
    return {
        "news": news_title,
        "teaser": teaser,
        "featured_image_url": featured_image_url,
        "featured_image_title": featured_image_title,
        "weather": mars_weather,
        "facts": mars_fact_table_html,
        "hemispheres": hemisphere_dict,
        "news_url": NEWS_URL,
        "jpl_url": IMAGE_URL,
        "weather_url": WEATHER_URL,
        "fact_url": FACTS_URL,
        "hemisphere_url": HEM_URL,
    }


def _scrape_news(news_url):
    """Return ``(title, teaser)`` for the first article listed at *news_url*."""
    driver = webdriver.Firefox()
    try:
        driver.get(news_url)
        listing = bs(driver.page_source, "html.parser")
    finally:
        # Always end the browser session, even if loading the page failed.
        driver.quit()

    link = listing.find("div", class_="list_text").find("a")
    news_title = link.text

    # urljoin resolves both root-relative ("/news/...") and relative hrefs
    # against the listing URL; plain concatenation would duplicate the path
    # for a root-relative href.
    teaser_url = urllib.parse.urljoin(news_url, link["href"])
    article = bs(requests.get(teaser_url).text, "html.parser")
    teaser = article.find("div", class_="wysiwyg_content").find("p").text
    return news_title, teaser


def _scrape_featured_image(image_url):
    """Return ``(image_url, title)`` of the featured image on the JPL page."""
    driver = webdriver.Firefox()
    try:
        driver.get(image_url)
        page = bs(driver.page_source, "html.parser")
    finally:
        driver.quit()

    # The image path is embedded in the carousel item's inline style as
    # url('...'); pull it out and drop the surrounding single quotes.
    style = page.find("article", {"class": "carousel_item"})["style"]
    image_path = re.findall(r"url\((.*?)\)", style)[0].replace("'", "")
    title = page.find("h1", class_="media_feature_title").text.strip()
    return "https://www.jpl.nasa.gov" + image_path, title


def _scrape_weather(weather_url):
    """Return the text of the first tweet container found at *weather_url*."""
    page = bs(requests.get(weather_url).text, "html.parser")
    tweets = page.find_all("div", class_="js-tweet-text-container")
    # The last 26 characters are cut off; presumably this removes a trailing
    # link appended to the tweet text -- TODO confirm against the live page.
    return tweets[0].text[:-26]


def _scrape_facts(facts_url):
    """Return the first HTML table at *facts_url* re-rendered as HTML."""
    # Use Pandas to read the HTML, then convert the first table back to HTML.
    tables = pd.read_html(facts_url)
    return tables[0].to_html(header=False, index=False, justify="left")


def _scrape_hemispheres(hem_url):
    """Return a list of ``{"title", "hem_img_url"}`` dicts, one per result."""
    browser = Browser("firefox")
    try:
        browser.visit(hem_url)
        # Collect every title/href pair up front, before navigating away
        # from the results page.
        links = [
            (a.text, a["href"])
            for a in browser.find_by_css('div[class="description"] a')
        ]
        hemispheres = []
        for title, url in links:
            browser.visit(url)
            img_url = browser.find_by_css('img[class="wide-image"]')["src"]
            hemispheres.append({"title": title, "hem_img_url": img_url})
        return hemispheres
    finally:
        # Always shut the browser down, even if a visit or lookup failed.
        browser.quit()