Skip to content

Commit

Permalink
Proxy support and firefox as default engine
Browse files Browse the repository at this point in the history
  • Loading branch information
raznem committed Aug 16, 2024
1 parent 9deb134 commit 1b594ae
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 9 deletions.
23 changes: 17 additions & 6 deletions parsera/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,27 @@ def __init__(self, model: BaseChatModel | None = None):
else:
self.model = model

async def _run(self, url: str, elements: dict) -> dict:
content = await fetch_page_content(url=url)
async def _run(
    self, url: str, elements: dict, proxy_settings: dict | None = None
) -> dict:
    """Fetch the page at ``url`` and run tabular extraction on its content.

    Args:
        url: Address of the page to fetch.
        elements: Passed through to ``TabularExtractor`` as ``elements``.
        proxy_settings: Optional proxy configuration. It is forwarded to
            ``fetch_page_content`` only when truthy, so ``None`` and an
            empty dict both leave that function's own default in effect.

    Returns:
        The result of awaiting ``TabularExtractor.run()``.
    """
    # Build the fetch arguments once; the proxy is added only when provided.
    fetch_kwargs = {"url": url}
    if proxy_settings:
        fetch_kwargs["proxy_settings"] = proxy_settings
    page_content = await fetch_page_content(**fetch_kwargs)

    return await TabularExtractor(
        elements=elements, model=self.model, content=page_content
    ).run()

def run(self, url: str, elements: dict) -> dict:
return asyncio.run(self._run(url=url, elements=elements))
def run(self, url: str, elements: dict, proxy_settings: dict | None = None) -> dict:
return asyncio.run(
self._run(url=url, elements=elements, proxy_settings=proxy_settings)
)

async def arun(self, url: str, elements: dict) -> dict:
return await self._run(url=url, elements=elements)
async def arun(
self, url: str, elements: dict, proxy_settings: dict | None = None
) -> dict:
return await self._run(
url=url, elements=elements, proxy_settings=proxy_settings
)
20 changes: 18 additions & 2 deletions parsera/page.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,27 @@
from typing import TypedDict

from playwright.async_api import async_playwright
from playwright_stealth import stealth_async


async def fetch_page_content(url: str) -> str:
class ProxySettings(TypedDict, total=False):
server: str
bypass: str | None = None
username: str | None = None
password: str | None = None


async def fetch_page_content(
url: str,
proxy_settings: ProxySettings | None = None,
browser: str = "firefox",
) -> str:
async with async_playwright() as p:
# Launch the browser
browser = await p.chromium.launch(headless=True)
if browser == "firefox":
browser = await p.firefox.launch(headless=True, proxy=proxy_settings)
else:
browser = await p.chromium.launch(headless=True, proxy=proxy_settings)
# Open a new browser context
context = await browser.new_context()
# Open a new page
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "parsera"
version = "0.1.2"
version = "0.1.3"
description = "Lightweight library for scraping web-sites with LLMs"
authors = ["Mikhail Zanka <[email protected]>"]
license = "GPL-2.0-or-later"
Expand Down

0 comments on commit 1b594ae

Please sign in to comment.