Commit 6553ee5: Added Ollama
raznem committed Jan 22, 2025
1 parent 38762ba
Showing 4 changed files with 567 additions and 367 deletions.
docs/features/custom-models.md: 51 changes (42 additions & 9 deletions)
@@ -1,4 +1,6 @@
## Run with custom model
Note that small local models tend to truncate long outputs and may require more careful tuning of the data description.

## Run a custom LangChain OpenAI model

You can instantiate `Parsera` with any chat model supported by LangChain; for example, to run a model hosted on Azure:
```python
@@ -14,16 +16,48 @@ llm = AzureChatOpenAI(
temperature=0.0,
)

-url = "https://news.ycombinator.com/"
+url = "https://github.com/raznem/parsera"
elements = {
-    "Title": "News title",
-    "Points": "Number of points",
-    "Comments": "Number of comments",
+    "Stars": "Number of stars",
+    "Fork": "Number of forks",
}
scrapper = Parsera(model=llm)
result = scrapper.run(url=url, elements=elements)
```

## Run local model with `Ollama`
First, install and run `ollama` in your local environment, following the [official installation guide](https://github.com/ollama/ollama?tab=readme-ov-file#ollama).
Additionally, you need to install `langchain_ollama` with:
```shell
pip install langchain-ollama
```

The next step is pulling a [model](https://ollama.com/search). For example, to pull Qwen2.5 14B, run:
```shell
ollama pull qwen2.5:14b
```

Once the setup is complete, run:
```python
from parsera import Parsera
from langchain_ollama import ChatOllama

url = "https://github.com/raznem/parsera"
elements = {
"Stars": "Number of stars",
"Fork": "Number of forks",
}

llm = ChatOllama(
model="qwen2.5:14b",
temperature=0,
)

scrapper = Parsera(model=llm)
result = await scrapper.arun(url=url, elements=elements)
```


## Run local model with `Transformers`
Currently, we only support models that include a `system` token.

@@ -38,11 +72,10 @@
```python
from parsera.engine.model import HuggingFaceModel
from parsera import Parsera

# Define the URL and elements to scrape
-url = "https://news.ycombinator.com/"
+url = "https://github.com/raznem/parsera"
elements = {
-    "Title": "News title",
-    "Points": "Number of points",
-    "Comments": "Number of comments",
+    "Stars": "Number of stars",
+    "Fork": "Number of forks",
}

# Initialize model with transformers pipeline
```
parsera/engine/simple_extractor.py: 6 changes (5 additions & 1 deletion)
@@ -56,7 +56,8 @@ async def run(self, content: str, attributes: dict[str, str]) -> list[dict]:


TABULAR_EXTRACTOR_SYSTEM_PROMPT = """
-Your goal is to find the elements from the webpage content and return them in json format.
+Your goal is to find the elements from the webpage content and return a list of them in JSON format.
+Make sure to return a list of all relevant elements from the page.
For example if user asks:
Return the following elements from the page content:
```
@@ -89,6 +90,7 @@ async def run(self, content: str, attributes: dict[str, str]) -> list[dict]:
{"link": "https://example.com/link1"},
{"link": "https://example.com/link2"},
{"link": "https://example.com/link3"},
+{"link": "https://example.com/link4"},
]
```
@@ -98,6 +100,8 @@ async def run(self, content: str, attributes: dict[str, str]) -> list[dict]:
{"name": "name1", "price": "100"},
{"name": "name2", "price": null},
{"name": "name3", "price": "300"},
+{"name": "name4", "price": "250"},
+{"name": "name5", "price": "99"},
]
```
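Since the extractor prompt asks the model for plain JSON, its output can be parsed directly with the standard library; note that `null` in the examples above becomes Python `None`. A small illustrative check (the `raw` string is a made-up model response in the requested shape):

```python
import json

# Example model output in the shape the tabular extractor prompt requests
raw = '''
[
    {"name": "name1", "price": "100"},
    {"name": "name2", "price": null},
    {"name": "name3", "price": "300"}
]
'''

rows = json.loads(raw)
print(len(rows))         # → 3
print(rows[1]["price"])  # JSON null parses to Python None
```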