Skip to content

Commit

Permalink
PageLoader class implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
raznem committed Aug 27, 2024
1 parent 7004046 commit 7936e8e
Show file tree
Hide file tree
Showing 3 changed files with 85 additions and 15 deletions.
27 changes: 18 additions & 9 deletions parsera/main.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,39 @@
import asyncio
from langchain_core.language_models import BaseChatModel
import enum

from langchain_core.language_models import BaseChatModel

from parsera.engine.model import GPT4oMiniModel
from parsera.engine.simple_extractor import TabularExtractor, ListExtractor, ItemExtractor
from parsera.page import fetch_page_content
from parsera.engine.simple_extractor import (
ItemExtractor,
ListExtractor,
TabularExtractor,
)
from parsera.page import PageLoader, fetch_page_content


class Parsera:
class ExtractorType(enum.Enum):
LIST = ListExtractor
TABULAR = TabularExtractor
ITEM = ItemExtractor

def __init__(self, model: BaseChatModel | None = None, extractor: ExtractorType = ExtractorType.TABULAR):
def __init__(
self,
model: BaseChatModel | None = None,
extractor: ExtractorType = ExtractorType.TABULAR,
):
if model is None:
self.model = GPT4oMiniModel()
else:
self.model = model
self.extractor = extractor
self.loader = PageLoader()

async def _run(
self, url: str, elements: dict, proxy_settings: dict | None = None
) -> dict:
if proxy_settings:
content = await fetch_page_content(url=url, proxy_settings=proxy_settings)
else:
content = await fetch_page_content(url=url)
content = await self.loader.load_content(url=url, proxy_settings=proxy_settings)
extractor_instance = self.extractor.value(
elements=elements, model=self.model, content=content
)
Expand All @@ -41,4 +50,4 @@ async def arun(
) -> dict:
return await self._run(
url=url, elements=elements, proxy_settings=proxy_settings
)
)
71 changes: 66 additions & 5 deletions parsera/page.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
from typing import TypedDict
import warnings
from typing import Literal, TypedDict

from playwright.async_api import async_playwright
from playwright.async_api import (
Browser,
BrowserContext,
Page,
Playwright,
async_playwright,
)
from playwright_stealth import stealth_async


Expand All @@ -11,19 +18,73 @@ class ProxySettings(TypedDict, total=False):
password: str | None = None


class PageLoader:
    """Fetch rendered page HTML through a headless Playwright browser.

    The loader keeps its Playwright driver, browser, context and the last
    opened page on the instance so that consecutive calls can reuse them.
    Call :meth:`close` when done to release the browser and the driver.
    """

    def __init__(
        self,
        browser: Literal["firefox", "chromium"] = "firefox",
    ):
        """Store the browser choice; nothing is launched until it is needed.

        Args:
            browser: Engine to launch. ``"firefox"`` launches Firefox; any
                other value launches Chromium.
        """
        self._browser_id = browser
        self.playwright: Playwright | None = None
        self.browser: Browser | None = None
        self.context: BrowserContext | None = None
        self.page: Page | None = None

    async def new_browser(self) -> None:
        """Start Playwright if necessary and launch a fresh headless browser.

        An already running browser is closed first. The stored context and
        page are reset because they belonged to the browser being replaced.
        """
        if not self.playwright:
            self.playwright = await async_playwright().start()

        if self.browser:
            await self.browser.close()
            self.browser = None
        # Handles tied to the previous browser must not be reused.
        self.context = None
        self.page = None

        if self._browser_id == "firefox":
            self.browser = await self.playwright.firefox.launch(headless=True)
        else:
            self.browser = await self.playwright.chromium.launch(headless=True)

    async def load_content(
        self,
        url: str,
        proxy_settings: ProxySettings | None = None,
        new_browser: bool = True,
        load_state: Literal[
            "domcontentloaded", "load", "networkidle"
        ] = "domcontentloaded",
    ) -> str:
        """Open ``url`` in a new page and return its HTML content.

        Args:
            url: Address to navigate to.
            proxy_settings: Proxy configuration passed to the browser context
                when a context is created. It is not applied when an existing
                context is reused.
            new_browser: When true, relaunch the browser and create a new
                context. When false, reuse the current browser and context,
                creating whichever is still missing.
            load_state: Page load state to wait for after navigation.

        Returns:
            The page markup as returned by ``page.content()``.
        """
        # Also launch when reuse was requested but nothing is running yet,
        # instead of failing on a ``None`` browser or context.
        if new_browser or self.browser is None:
            await self.new_browser()
        if self.context is None:
            self.context = await self.browser.new_context(proxy=proxy_settings)

        self.page = await self.context.new_page()
        await stealth_async(self.page)
        # NOTE: aborting image requests with ``page.route`` could speed up
        # loading; it is intentionally not enabled here.
        await self.page.goto(url)
        await self.page.wait_for_load_state(load_state)
        return await self.page.content()

    async def close(self) -> None:
        """Close the browser and stop the Playwright driver.

        Safe to call when nothing was started. All stored handles are reset,
        so the loader can be used again afterwards.
        """
        browser, playwright = self.browser, self.playwright
        self.page = None
        self.context = None
        self.browser = None
        self.playwright = None
        try:
            if browser:
                await browser.close()
        finally:
            # ``stop`` is a coroutine in the async API and must be awaited.
            if playwright:
                await playwright.stop()


async def fetch_page_content(
url: str,
proxy_settings: ProxySettings | None = None,
browser: str = "firefox",
) -> str:
warnings.warn(
"fetch_page_content is deprecated and will be removed",
DeprecationWarning,
)
async with async_playwright() as p:
# Launch the browser
if browser == "firefox":
browser = await p.firefox.launch(headless=True, proxy=proxy_settings)
browser = await p.firefox.launch(headless=True)
else:
browser = await p.chromium.launch(headless=True, proxy=proxy_settings)
browser = await p.chromium.launch(headless=True)
# Open a new browser context
context = await browser.new_context()
context = await browser.new_context(proxy=proxy_settings)
# Open a new page
page = await context.new_page()
await stealth_async(page)
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "parsera"
version = "0.1.4"
version = "0.1.5"
description = "Lightweight library for scraping web-sites with LLMs"
authors = ["Mikhail Zanka <[email protected]>"]
license = "GPL-2.0-or-later"
Expand Down

0 comments on commit 7936e8e

Please sign in to comment.