forked from Ecoblockchain/knowledge-map-search-engine
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathspider.py
executable file
·57 lines (44 loc) · 1.78 KB
/
spider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
"""
This is a Scrapy spider that takes as input a JSON or JS file, with a list
of objects containing the name and map attributes; and scrapes MindNode for
the specified maps.
"""
from os import path, makedirs
import json
import re
import scrapy
def parse_urls():
"""
Return map IDs converted from triggers.js file and convert them to URLs
for fetching JSON data.
"""
with open('triggers.js', 'r') as triggers_file:
data = ''.join(triggers_file.readlines())
# Remove all that is before and after the triggers list.
data = re.sub(r'.*\[', '[', data)
data = re.sub(r'\].*', ']', data)
# Convert IDs to URLs
template = r'https://my.mindnode.com/\g<1>.json'
data = re.sub(r'map: \'(.{40}).*\'', 'map: "{}"'.format(template), data)
# Add quotes to attributes and convert single quotes to double quotes.
data = re.sub(r'(name|map):', r'"\g<1>":', data)
data = re.sub('\'', '"', data)
# Remove trailing comma
data = re.sub(r'},\n]', r'}\n]', data)
return [el['map'] for el in json.loads(data)]
return None
class MindNodeSpider(scrapy.Spider):
"""
MindNode Spider, scrapes maps from MindNode.
"""
name = 'MindNode Spider'
start_urls = parse_urls()
def parse(self, response):
data = json.loads(response.body_as_unicode())
filename = data['title'].replace(' - ', '/').replace(' ', '-')
filename = 'maps/{}.json'.format(filename)
if not path.exists(path.dirname(filename)):
makedirs(path.dirname(filename))
with open(filename, 'w') as map_file:
map_file.write(response.body.replace('https://my.mindnode.com', '/id'))
yield {'id': data['token'], 'title': data['title']}