-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdouban_FM-Crawler.py
46 lines (40 loc) · 1.52 KB
/
douban_FM-Crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
#import urllib
import re
import sys
import urllib2
class douban_FM:
    """This is a python crawler to crawl the popular music on Douban FM.

    Each page is fetched from ``url + str(currentPage)``. Every song on the
    page whose ``n_doulists`` counter is at least ``thresholdValue`` is
    printed to stdout, preceded by one header line holding the page title
    and the page number.

    NOTE(review): the ``n_doulists`` counter is presumably the number of
    doulists (user playlists) that contain the song -- confirm against the
    page markup.
    """

    # Compiled once at class creation instead of on every page fetch.
    _TITLE_PATTERN = re.compile('<title>(.*?)\\|.*?</title>', re.S)
    _SONG_PATTERN = re.compile(
        r'<div class="col song-name-short" data-title="(.*?)">.*?'
        r'<span class="n_doulists unfoldable">(\d+).*?</span>',
        re.S)

    def __init__(self):
        # Base URL; the numeric page id is appended to it for every request.
        self.url = "http://music.douban.com/musician/"
        # Id of the page that the next getData() call will fetch.
        self.currentPage = 100001
        # Minimum counter value a song needs in order to be printed.
        self.thresholdValue = 100
        # Opening in 'w' mode creates/truncates the file. Nothing in this
        # class writes to it; multiPage() closes it when the crawl ends.
        self.txtFile = open('douban_Data.txt', 'w')

    def getData(self):
        """Fetch the page for ``currentPage`` and print its qualifying songs.

        Output format (stdout): one ``<title>\\t<page number>`` line, then
        one ``<song> \\t <count>`` line per qualifying song. Nothing is
        printed when no song reaches ``thresholdValue``.

        A page that cannot be fetched is reported on stderr and skipped, so
        one bad page never aborts a longer crawl. Always returns None.
        """
        pageUrl = self.url + str(self.currentPage)
        try:
            response = urllib2.urlopen(pageUrl)
            try:
                rawPage = response.read()
            finally:
                # Release the connection even if read() fails.
                response.close()
        except IOError as e:
            # urllib2.URLError (and HTTPError, e.g. a 404 for an unused id)
            # as well as socket-level errors all derive from IOError.
            sys.stderr.write("skipping %s: %s\n" % (pageUrl, e))
            return

        # "replace" keeps a page usable even if it contains a few bytes that
        # are not valid UTF-8, instead of raising UnicodeDecodeError.
        pageInfo = rawPage.decode("utf-8", "replace")

        # Keyed by song title, so a title listed twice is reported once
        # (the last counter seen wins).
        data = {}
        for songTitle, count in self._SONG_PATTERN.findall(pageInfo):
            if int(count) >= self.thresholdValue:
                data[songTitle] = count
        if not data:
            return

        # The title pattern needs a '|' inside <title>; fall back to a
        # placeholder instead of crashing when the page has no such title.
        titleMatch = self._TITLE_PATTERN.search(pageInfo)
        pageTitle = titleMatch.group(1) if titleMatch else "unknown"
        print(pageTitle + "\t" + str(self.currentPage))
        for songTitle in data:
            print(songTitle + " \t " + data[songTitle])

    def multiPage(self, start=2396, end=10000):
        """Crawl every page with id ``100000 + n`` for n in ``start..end``.

        Both bounds are inclusive; the defaults reproduce the range that was
        previously hard-coded. ``txtFile`` is closed once the crawl
        finishes, including when it is interrupted by an exception.
        """
        try:
            for num in range(start, end + 1):
                self.currentPage = 100000 + num
                self.getData()
        finally:
            self.txtFile.close()
if __name__ == "__main__":
    # Start the crawl only when this file is executed as a script, so that
    # importing the module does not trigger network requests.
    douban = douban_FM()
    douban.multiPage()