Introduce coroutines in the getter, and optimize the aiohttp session in the tester #92

Status: Open. Wants to merge 2 commits into master. The diff below shows the changes from all commits.
43 changes: 26 additions & 17 deletions proxypool/crawlers/base.py
@@ -1,32 +1,41 @@
-from retrying import retry
-import requests
+import asyncio
+import aiohttp
 from loguru import logger
 from proxypool.setting import GET_TIMEOUT
 
 
 class BaseCrawler(object):
     urls = []
 
+    def __init__(self):
+        self.loop = asyncio.get_event_loop()
+
-    @retry(stop_max_attempt_number=3, retry_on_result=lambda x: x is None, wait_fixed=2000)
-    def fetch(self, url, **kwargs):
+    async def fetch(self, session, url, **kwargs):
         try:
             kwargs.setdefault('timeout', GET_TIMEOUT)
-            kwargs.setdefault('verify', False)
-            response = requests.get(url, **kwargs)
-            if response.status_code == 200:
-                response.encoding = 'utf-8'
-                return response.text
-        except requests.ConnectionError:
+            async with session.get(url, **kwargs) as response:
+                if response.status == 200:
+                    response.encoding = 'utf-8'
+                    return await response.text()
+        except aiohttp.ClientConnectionError:
             return
 
     @logger.catch
-    def crawl(self):
+    async def crawl(self):
         """
         crawl main method
         """
-        for url in self.urls:
-            logger.info(f'fetching {url}')
-            html = self.fetch(url)
-            for proxy in self.parse(html):
-                logger.info(f'fetched proxy {proxy.string()} from {url}')
-                yield proxy
+        proxies = []
+        async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
+            tasks = [self.fetch(session, url) for url in self.urls]
+            results = await asyncio.gather(*tasks)
+            for result in results:
+                if result:
+                    for proxy in self.parse(result):
+                        proxies.append(proxy)
+        return proxies
+
+    def run(self):
+        return self.loop.run_until_complete(self.crawl())
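
For reference, here is a minimal sketch of how a subclass uses the reworked base class. ExampleCrawler, its URL, and its parse() body are hypothetical; BaseCrawler, Proxy, fetch(), crawl(), and run() come from the code above.

# ExampleCrawler is a hypothetical subclass; only BaseCrawler, Proxy,
# fetch(), crawl() and run() come from this pull request.
from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler


class ExampleCrawler(BaseCrawler):
    urls = ['http://example.com/free-proxy-list']  # hypothetical proxy source

    def parse(self, html):
        # parse() stays synchronous; crawl() hands it the downloaded pages
        for line in html.splitlines():
            host, _, port = line.partition(':')
            if host and port.isdigit():
                yield Proxy(host=host, port=int(port))


if __name__ == '__main__':
    # run() drives the event loop: crawl() gathers one fetch() task per URL
    # and returns the parsed proxies as a plain list
    proxies = ExampleCrawler().run()
    print(proxies)

Because run() wraps loop.run_until_complete(), callers such as the per-file __main__ blocks and the getter stay synchronous while the page downloads inside crawl() run concurrently.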
2 changes: 1 addition & 1 deletion proxypool/crawlers/public/daili66.py
@@ -28,5 +28,5 @@ def parse(self, html):
 
 if __name__ == '__main__':
     crawler = Daili66Crawler()
-    for proxy in crawler.crawl():
+    for proxy in crawler.run():
         print(proxy)
28 changes: 15 additions & 13 deletions proxypool/crawlers/public/data5u.py
@@ -1,3 +1,5 @@
+import asyncio
+import aiohttp
 from pyquery import PyQuery as pq
 from proxypool.schemas.proxy import Proxy
 from proxypool.crawlers.base import BaseCrawler
@@ -11,23 +13,23 @@ class Data5UCrawler(BaseCrawler):
     data5u crawler, http://www.data5u.com
     """
     urls = [BASE_URL]
 
     headers = {
         'User-Agent': 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
     }
 
     @logger.catch
-    def crawl(self):
-        """
-        crawl main method
-        """
-        for url in self.urls:
-            logger.info(f'fetching {url}')
-            html = self.fetch(url, headers=self.headers)
-            for proxy in self.parse(html):
-                logger.info(f'fetched proxy {proxy.string()} from {url}')
-                yield proxy
+    async def crawl(self):
+        proxies = []
+        async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
+            tasks = [self.fetch(session, url, headers=self.headers) for url in self.urls]
+            results = await asyncio.gather(*tasks)
+            for result in results:
+                if result:
+                    for proxy in self.parse(result):
+                        proxies.append(proxy)
+        return proxies
 
     def parse(self, html):
         """
         parse html file to get proxies
@@ -43,5 +45,5 @@ def parse(self, html):
 
 if __name__ == '__main__':
     crawler = Data5UCrawler()
-    for proxy in crawler.crawl():
+    for proxy in crawler.run():
         print(proxy)
2 changes: 1 addition & 1 deletion proxypool/crawlers/public/ip3366.py
@@ -28,5 +28,5 @@ def parse(self, html):
 
 if __name__ == '__main__':
     crawler = IP3366Crawler()
-    for proxy in crawler.crawl():
+    for proxy in crawler.run():
         print(proxy)
5 changes: 3 additions & 2 deletions proxypool/crawlers/public/iphai.py
@@ -5,6 +5,7 @@
 
 BASE_URL = 'http://www.iphai.com/'
 
+
 class IPHaiCrawler(BaseCrawler):
     """
     iphai crawler, http://www.iphai.com/
@@ -28,8 +29,8 @@ def parse(self, html):
             proxy = Proxy(host=address.strip(), port=int(port.strip()))
             yield proxy
 
+
 if __name__ == '__main__':
     crawler = IPHaiCrawler()
-    for proxy in crawler.crawl():
+    for proxy in crawler.run():
         print(proxy)
-
2 changes: 1 addition & 1 deletion proxypool/crawlers/public/kuaidaili.py
@@ -29,5 +29,5 @@ def parse(self, html):
 
 if __name__ == '__main__':
     crawler = KuaidailiCrawler()
-    for proxy in crawler.crawl():
+    for proxy in crawler.run():
         print(proxy)
29 changes: 16 additions & 13 deletions proxypool/crawlers/public/xicidaili.py
@@ -1,8 +1,11 @@
+import asyncio
+import aiohttp
 from pyquery import PyQuery as pq
 from proxypool.schemas.proxy import Proxy
 from proxypool.crawlers.base import BaseCrawler
 from loguru import logger
 
 
 BASE_URL = 'https://www.xicidaili.com/'
 
+
@@ -18,17 +21,18 @@ class XicidailiCrawler(BaseCrawler):
     }
 
     @logger.catch
-    def crawl(self):
-        """
-        crawl main method
-        """
-        for url in self.urls:
-            logger.info(f'fetching {url}')
-            html = self.fetch(url, headers=self.headers)
-            for proxy in self.parse(html):
-                logger.info(f'fetched proxy {proxy.string()} from {url}')
-                yield proxy
+    async def crawl(self):
+        proxies = []
+        async with aiohttp.ClientSession(
+                connector=aiohttp.TCPConnector(ssl=False)) as session:
+            tasks = [self.fetch(session, url, headers=self.headers) for url in self.urls]
+            results = await asyncio.gather(*tasks)
+            for result in results:
+                if result:
+                    for proxy in self.parse(result):
+                        proxies.append(proxy)
+        return proxies
 
     def parse(self, html):
         """
         parse html file to get proxies
@@ -47,6 +51,5 @@ def parse(self, html):
 
 if __name__ == '__main__':
     crawler = XicidailiCrawler()
-    for proxy in crawler.crawl():
+    for proxy in crawler.run():
         print(proxy)
-
2 changes: 1 addition & 1 deletion proxypool/crawlers/public/xiladaili.py
@@ -28,5 +28,5 @@ def parse(self, html):
 
 if __name__ == '__main__':
     crawler = XiladailiCrawler()
-    for proxy in crawler.crawl():
+    for proxy in crawler.run():
         print(proxy)
29 changes: 16 additions & 13 deletions proxypool/crawlers/public/zhandaye.py
@@ -1,13 +1,15 @@
+import asyncio
+import aiohttp
 from pyquery import PyQuery as pq
 from proxypool.schemas.proxy import Proxy
 from proxypool.crawlers.base import BaseCrawler
 from loguru import logger
 import re
 
 
 BASE_URL = 'https://www.zdaye.com/dayProxy/{page}.html'
 MAX_PAGE = 5
 
 
 class ZhandayeCrawler(BaseCrawler):
     """
     zhandaye crawler, https://www.zdaye.com/dayProxy/
@@ -19,25 +21,27 @@ class ZhandayeCrawler(BaseCrawler):
     urls = []
     ignore = True
 
-    def crawl(self):
-        self.crawl_catalog()
-        yield from super().crawl()
+    async def crawl(self):
+        await self.crawl_catalog()
+        return await super().crawl()
 
-    def crawl_catalog(self):
-        for url in self.urls_catalog:
-            logger.info(f'fetching {url}')
-            html = self.fetch(url, headers=self.headers)
-            self.parse_catalog(html)
+    async def crawl_catalog(self):
+        async with aiohttp.ClientSession(
+                connector=aiohttp.TCPConnector(ssl=False)) as session:
+            tasks = [self.fetch(session, url, headers=self.headers) for url in self.urls_catalog]
+            results = await asyncio.gather(*tasks)
+            for result in results:
+                if result:
+                    self.parse_catalog(result)
 
     def parse_catalog(self, html):
         """
-        parse html file to get proxies
+        parse catalog_html file to get urls
         :return:
         """
         doc = pq(html)
         for item in doc('#J_posts_list .thread_item div div p a').items():
             url = 'https://www.zdaye.com' + item.attr('href')
             logger.info(f'get detail url: {url}')
             self.urls.append(url)
 
     def parse(self, html):
@@ -54,6 +58,5 @@ def parse(self, html):
 
 if __name__ == '__main__':
     crawler = ZhandayeCrawler()
-    for proxy in crawler.crawl():
+    for proxy in crawler.run():
         print(proxy)
-
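
ZhandayeCrawler is the only crawler with a two-stage flow: crawl_catalog() fills self.urls with detail-page links, and only then does the inherited crawl() fetch those pages concurrently. A minimal sketch of that ordering follows; every name is hypothetical except BaseCrawler and its methods, and the override has to return the result of super().crawl() so that run() still receives the proxy list.

# Hypothetical sketch of the catalog-then-detail pattern; only BaseCrawler,
# fetch(), crawl() and run() come from this pull request.
import asyncio
import aiohttp
from proxypool.crawlers.base import BaseCrawler


class CatalogCrawler(BaseCrawler):
    urls_catalog = ['https://example.com/catalog']  # hypothetical catalog page
    urls = []  # filled in before the inherited crawl() runs

    async def crawl(self):
        await self.crawl_catalog()    # step 1: discover detail-page URLs
        return await super().crawl()  # step 2: fetch them concurrently, keep the result

    async def crawl_catalog(self):
        async with aiohttp.ClientSession(
                connector=aiohttp.TCPConnector(ssl=False)) as session:
            results = await asyncio.gather(
                *(self.fetch(session, url) for url in self.urls_catalog))
        for html in results:
            if html:
                # hypothetical link extraction for the detail pages
                self.urls.extend(line for line in html.splitlines() if line.startswith('http'))

    def parse(self, html):
        # hypothetical: yield Proxy objects parsed from a detail page
        return []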

15 changes: 10 additions & 5 deletions proxypool/processors/getter.py
@@ -8,22 +8,22 @@ class Getter(object):
     """
     getter of proxypool
     """
 
     def __init__(self):
         """
         init db and crawlers
         """
         self.redis = RedisClient()
         self.crawlers_cls = crawlers_cls
         self.crawlers = [crawler_cls() for crawler_cls in self.crawlers_cls]
 
     def is_full(self):
         """
         if proxypool if full
         return: bool
         """
         return self.redis.count() >= PROXY_NUMBER_MAX
 
     @logger.catch
     def run(self):
         """
@@ -34,8 +34,13 @@ def run(self):
             return
         for crawler in self.crawlers:
             logger.info(f'crawler {crawler} to get proxy')
-            for proxy in crawler.crawl():
-                self.redis.add(proxy)
+            proxies = crawler.run()
+            if proxies:
+                for proxy in proxies:
+                    self.redis.add(proxy)
+                logger.info(f'crawled {len(proxies)} proxies from {crawler}')
+            else:
+                logger.debug(f'cannot crawl proxies from {crawler}')
 
 
 if __name__ == '__main__':
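
Since each crawler's run() now returns a list rather than a generator, the getter can test it for truthiness and log a count. A minimal sketch of that contract, using Daili66Crawler (one of the crawlers updated above):

# Sketch of the contract Getter.run() now relies on: crawler.run() blocks until
# all of that crawler's pages have been fetched concurrently, then returns a list.
from proxypool.crawlers.public.daili66 import Daili66Crawler

crawler = Daili66Crawler()
proxies = crawler.run()
print(len(proxies))           # a list, so len() and `if proxies:` both work
for proxy in proxies:
    print(proxy.string())     # string() formats host:port, as in the removed log lines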