switch to search engine

AntonKluge · Jul 6, 2024 · a78c7c2 · a78c7c2
1 parent 0625f8f
commit a78c7c2
Show file tree

Hide file tree

Showing 7 changed files with 44 additions and 37 deletions.
diff --git a/lens-gpt-backend/lens_gpt_backend/processing.py b/lens-gpt-backend/lens_gpt_backend/processing.py
@@ -24,7 +24,7 @@ def process_async(file_hash: str, init: Callable[[Producer], None]) -> None:
 
 def _processing_hierarchy(file_hash: str) -> Producer:
 
-    lens_links = LensLinksProducer(file_hash)
+    lens_links = LensLinksProducer(file_hash, add_queue=False)
     model_producer = ModelProducerProducer(file_hash)
     model_information_producer = ModelInformationProducer(file_hash)
     price_producer = PriceProducer(file_hash)

diff --git a/lens-gpt-backend/lens_gpt_backend/producer/model_information_producer.py b/lens-gpt-backend/lens_gpt_backend/producer/model_information_producer.py
@@ -70,7 +70,7 @@
 class ModelInformationProducer(Producer):
 
     def _produce(self, input_value: Product) -> tuple[Product, bool]:
-        producer_url = input_value.get_str()
+        producer_url = input_value.get_dict_str_str()["link"]
         scape_function = partial(_get_product_info, producer_url)
         result = driver_pool.execute(scape_function)
         return result, True

diff --git a/lens-gpt-backend/lens_gpt_backend/producer/model_producer.py b/lens-gpt-backend/lens_gpt_backend/producer/model_producer.py
@@ -20,7 +20,7 @@
                   "メルカリ\nPatagonia Men's P-6 Logo Long-Sleeve Responsibili-Tee | "
                   "Patagonia long sleeve, Tees, Patagonia")
 
-EXAMPLE_ANSWERS = "producer: Patagonia\nmodel: Patagonia Men's Long-Sleeved P-6 Logo Responsibili-Tee"
+EXAMPLE_ANSWERS = "producer: Patagonia\nmodel: Men's Long-Sleeved P-6 Logo Responsibili-Tee"
 
 
 class ModelProducerProducer(Producer):

diff --git a/lens-gpt-backend/lens_gpt_backend/producer/producer.py b/lens-gpt-backend/lens_gpt_backend/producer/producer.py
@@ -12,6 +12,8 @@ def __init__(self, file_hash: str, add_queue: bool = True) -> None:
 
         if add_queue:
             self._result_queue = ResultQueue.factory(file_hash)
+        else:
+            self._result_queue = None
 
         print(f"Producer[{file_hash}]: created")
 

diff --git a/lens-gpt-backend/lens_gpt_backend/producer/producer_website.py b/lens-gpt-backend/lens_gpt_backend/producer/producer_website.py
@@ -8,6 +8,7 @@
 from lens_gpt_backend.producer.producer import Producer
 from lens_gpt_backend.utils.chat_gpt import ask_chat_gpt
 from lens_gpt_backend.utils.driver_pool import driver_pool
+from lens_gpt_backend.utils.google_search import google_search
 from lens_gpt_backend.utils.product import Product
 from lens_gpt_backend.utils.utils import distinct
 
@@ -30,36 +31,42 @@
 class ProducerWebsite(Producer):
 
     def _produce(self, input_value: Product) -> tuple[Product, bool]:
-        base_url = "https://google.com/"
-        search_dict = input_value.get_dict_str_str()
-        search_term = f"{search_dict['producer']} {search_dict['model']}"
-        scape_function = partial(_get_urls_for_image, search_term)
-        result = driver_pool.execute(scape_function, base_url)
-        return result, True
-
-
-def build_google_search_url(query: str) -> str:
-    base_url = "https://www.google.com/search?q="
-    encoded_query = quote_plus(query)
-    return base_url + encoded_query
+        """Select one web search result for the product.
+
+        Searches for "<producer> <model>" (both read from ``input_value``),
+        shows the numbered result links to the chat model and returns the
+        search hit whose number the model answers with.
+
+        Returns:
+            The chosen search result wrapped in a ``Product`` described as
+            "url", together with ``True``.
+
+        Raises:
+            ValueError: if the search yields no results or the model gives
+                no valid answer in three attempts.
+        """
 
+        search_dict = input_value.get_dict_str_str()
+        search = search_dict["producer"] + " " + search_dict["model"]
+        # Offer at most 7 hits so the numbers in the prompt and the accepted
+        # answer range always refer to the same list.
+        search_results = google_search(search)[:7]
+        if not search_results:
+            raise ValueError("No search results for: " + search)
 
-def _get_urls_for_image(search: str, driver: WebDriver, wait: WebDriverWait[WebDriver]) -> Product:
-    search_format = build_google_search_url(search)
-    driver.get(search_format)
-
-    # Get all links from the search by getting all a tags from the center column with id center_col
-    links = driver.find_elements(By.CSS_SELECTOR, "#res a")
-    # Quick and dirty, filter all out which are not very wide, as they are probably ads
-    links = [link for link in links if link.size["width"] > 250]
-    urls = [link.get_attribute("href") for link in links]
-    non_google_urls = distinct([url for url in urls if url and "google.com" not in url])
-    enumerate_urls = [f"{i + 1}. {url}" for i, url in enumerate(non_google_urls)]
-    input_urls = "\n".join(enumerate_urls[:7])
-    prompt = f"Product: {search}\n{input_urls}"
-    response = ask_chat_gpt(ASSISTANT_INSTR, [EXAMPLE_TITLES, EXAMPLE_ANSWERS, prompt])
+        # One numbered link per line; interpolating the list object itself
+        # would put its Python repr into the prompt.
+        input_urls = "\n".join(f"{i + 1}. {result['link']}" for i, result in enumerate(search_results))
+        prompt = f"Product: {search}\n{input_urls}"
 
-    if response:
-        return Product(non_google_urls[int(response) - 1], data_description="url")
+        for _ in range(3):
+            response = ask_chat_gpt(ASSISTANT_INSTR, [EXAMPLE_TITLES, EXAMPLE_ANSWERS, prompt])
+            # NOTE(review): the reply may be empty or None - confirm against ask_chat_gpt; treated as invalid here.
+            answer = (response or "").strip()
+            # isdecimal() rather than isnumeric(): the latter also accepts
+            # characters such as superscript digits that int() rejects.
+            if answer.isdecimal() and 1 <= int(answer) <= len(search_results):
+                return Product(search_results[int(answer) - 1], data_description="url"), True  # type: ignore
+            print(f"Invalid response, please try again: {response}\n{prompt}")
 
-    raise ValueError("No response from AI model!")
+        raise ValueError("No response from AI model!")
diff --git a/lens-gpt-backend/lens_gpt_backend/utils/google_search.py b/lens-gpt-backend/lens_gpt_backend/utils/google_search.py
@@ -0,0 +1,41 @@
+import os
+
+import requests
+
+_SEARCH_URL = "https://www.googleapis.com/customsearch/v1"
+
+# NOTE(review): credentials do not belong in source control. These literals are
+# kept only as fallbacks so existing setups keep working; supply the values via
+# the environment variables read below and rotate the committed key.
+_FALLBACK_API_KEY = "AIzaSyA3oA_T8h6RORQZe-3wmHGVUDIFXFm42fQ"
+_FALLBACK_ENGINE_ID = "23b9ec7abc764401d"
+
+
+def google_search(query: str, timeout: float = 10.0) -> list[dict[str, str | None]]:
+    """Query the Google Custom Search JSON API and return its hits.
+
+    Args:
+        query: search terms, sent as the ``q`` parameter.
+        timeout: seconds to wait for the HTTP request.
+
+    Returns:
+        One ``{"title": ..., "link": ...}`` dict per result item, in the order
+        the API returned them; empty when the response has no ``items``.
+
+    Raises:
+        requests.HTTPError: if the API answers with an error status.
+    """
+    params = {
+        "key": os.environ.get("GOOGLE_SEARCH_API_KEY") or _FALLBACK_API_KEY,
+        "cx": os.environ.get("GOOGLE_SEARCH_ENGINE_ID") or _FALLBACK_ENGINE_ID,
+        "q": query,
+    }
+
+    # A timeout keeps a stalled connection from blocking the caller indefinitely.
+    response = requests.get(_SEARCH_URL, params=params, timeout=timeout)
+    # Fail on HTTP errors here rather than with a KeyError on the error payload.
+    response.raise_for_status()
+    results = response.json()
+
+    # "items" is absent from the response when the query has no hits.
+    return [{"title": item["title"], "link": item["link"]} for item in results.get("items", [])]
diff --git a/lens-gpt-backend/templates/index.html b/lens-gpt-backend/templates/index.html
@@ -23,8 +23,6 @@ <h2>Results:</h2>
             return;
         }
 
-
-
         const file = fileInput.files[0];
         const formData = new FormData();
         formData.append('file', file);
@@ -57,9 +55,18 @@ <h2>Results:</h2>
                         timestamp.textContent = new Date().toISOString();
                         document.getElementById('results').appendChild(timestamp);
 
-                        // Append data
-                        const newElement = document.createElement('div');
-                        newElement.textContent = part.replace(/^data: /, '');
+                        // Format JSON data
+                        let formattedData;
+                        try {
+                            const jsonData = JSON.parse(part.replace(/^data: /, ''));
+                            formattedData = JSON.stringify(jsonData, null, 2);
+                        } catch (e) {
+                            formattedData = part.replace(/^data: /, '');
+                        }
+
+                        // Append formatted data
+                        const newElement = document.createElement('pre');
+                        newElement.textContent = formattedData;
                         document.getElementById('results').appendChild(newElement);
 
                         // Append divider