-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathimages.py
75 lines (56 loc) · 2.44 KB
/
images.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import sys, os, re, requests, urllib
from lxml.html import fromstring
GALLERY_OVERVIEW_URL = "http://www.arto.com/section/user/profile/gallery/?id=%d"
GALLERY_URL = "http://www.arto.com/section/user/profile/gallery/?id=%d&category=%d&ContentList_ActivePage=%d"
def get_galleries(user_id, root, session):
    """Yield ``(gallery_id, gallery_name)`` for the galleries on a user's overview page.

    The overview page is fetched through *session* and its raw bytes are
    saved as ``<root>images/overview.html`` before any link is inspected.
    Because this is a generator, nothing is fetched or written until the
    first item is requested.

    Args:
        user_id: Integer user id; interpolated with ``%d`` into the overview
            URL and into the XPath that selects gallery links.
        root: Output path prefix. It is concatenated directly with
            ``"images/"``, so it has to end with a path separator already.
        session: Object whose ``get(url)`` returns a response exposing a
            bytes ``content`` attribute (presumably a ``requests.Session``).

    Yields:
        ``(gallery_id, link_text)`` tuples. ``link_text`` is whatever the
        anchor's ``.text`` is and may be ``None``.
    """
    # Local import: the module only does ``import urllib``, which does not
    # guarantee that the ``urllib.parse`` submodule is loaded.
    from urllib.parse import parse_qs, urlparse

    response = session.get(GALLERY_OVERVIEW_URL % user_id)
    dom = fromstring(response.content)

    folder = root + "images/"
    os.makedirs(folder, exist_ok=True)
    # Context manager so the handle is released even if the write fails.
    with open(folder + "overview.html", "wb") as fp:
        fp.write(response.content)

    # The anchors carry no classes or ids, so they are matched on their href.
    link_xpath = ".//a[starts-with(@href,'/section/user/profile/gallery/?id=%d&category=')]" % user_id
    for link in dom.xpath(link_xpath):
        href = link.get("href")

        # Read the id from the ``category`` query parameter itself instead of
        # "whatever follows the last '='", so trailing parameters or a
        # fragment cannot be mistaken for the gallery id.
        category_values = parse_qs(urlparse(href).query).get("category")
        if not category_values:
            continue
        try:
            gallery_id = int(category_values[-1])
        except ValueError:
            # Not a numeric gallery id; skip the link rather than abort.
            continue

        if gallery_id < 0:
            continue
        # Anchors that have an <img> child are skipped; presumably they are
        # thumbnail duplicates of the text links -- TODO confirm.
        if link.find("img") is not None:
            continue

        yield gallery_id, link.text
def scrape_gallery(gallery_id, user_id, root, session):
    """Save every listing page and image of one gallery under ``<root>images/<gallery_id>/``.

    Pages are requested one after another starting at page 1. Each page's
    raw HTML is stored as ``gallery-page<N>.html``; every ``<img>`` whose
    ``src`` contains ``/data/user/gallery`` is then downloaded (with
    ``thumbs`` replaced by ``images`` in its URL) and stored as
    ``<counter>.jpg``, where the counter runs across all pages. Paging stops
    after the first page that has no link whose text is ``Næste``.

    Args:
        gallery_id: Integer gallery (category) id.
        user_id: Integer id of the user who owns the gallery.
        root: Output path prefix. It is concatenated directly with
            ``"images/"``, so it has to end with a path separator already.
        session: Object whose ``get(url)`` returns a response exposing a
            bytes ``content`` attribute (presumably a ``requests.Session``).
    """
    # The target folder does not depend on the page, so create it once.
    folder = root + "images/" + str(gallery_id) + "/"
    os.makedirs(folder, exist_ok=True)

    page_counter = 1
    image_counter = 1
    while True:
        # Fetch and archive the listing page.
        print(" - page %d" % page_counter)
        response = session.get(GALLERY_URL % (user_id, gallery_id, page_counter))
        dom = fromstring(response.content)

        # Context managers so handles are released even if a write fails.
        with open(folder + "gallery-page%d.html" % page_counter, "wb") as fp:
            fp.write(response.content)

        for img_elm in dom.xpath(".//img[contains(@src,'/data/user/gallery')]"):
            src = img_elm.get("src")
            print(" - image %d" % image_counter)
            # The listing shows thumbnails; swapping the path segment is
            # what the code relies on to reach the full-size file.
            img_data = session.get(src.replace("thumbs", "images"))
            with open(folder + str(image_counter) + ".jpg", "wb") as image_fp:
                image_fp.write(img_data.content)
            image_counter += 1

        # No "Næste" link on this page means it was the last one.
        if not dom.xpath('.//a[text()="Næste"]'):
            break
        page_counter += 1
def scrape_images(user_id, root, session):
    """Scrape every gallery of a user, printing progress along the way.

    Walks the galleries produced by ``get_galleries`` and hands each one to
    ``scrape_gallery``. A gallery whose name is ``None`` is announced
    without a name.

    Args:
        user_id: Id of the user whose galleries are scraped.
        root: Output path prefix, passed through unchanged.
        session: HTTP session object, passed through unchanged.
    """
    print(" - images")
    galleries = get_galleries(user_id, root, session)
    for gallery_id, gallery_name in galleries:
        heading = " - gallery" if gallery_name is None else " - gallery: " + gallery_name
        print(heading)
        scrape_gallery(gallery_id, user_id, root, session)