From 1c7b31dd074e763badc631fbd2e5b8ca9f9006e2 Mon Sep 17 00:00:00 2001 From: Soeb-aryn Date: Mon, 9 Sep 2024 13:14:40 -0700 Subject: [PATCH] improving ExtractTableProperties and standardizer transforms (#773) * ntsb transform changes * updating ntsb notebook * removing old file * lint fixes * filename fixes * linting fix * linting fix * moving prompts to a common file * lint fix --------- Co-authored-by: Soeb --- .../sycamore/llms/prompts/__init__.py | 4 + .../sycamore/llms/prompts/default_prompts.py | 33 ++ .../unit/transforms/test_standardizer.py | 8 +- .../transforms/assign_doc_properties.py | 2 +- .../transforms/extract_table_properties.py | 42 +- .../sycamore/transforms/standardizer.py | 55 ++- notebooks/ntsb-demo.ipynb | 362 ++++++------------ 7 files changed, 202 insertions(+), 304 deletions(-) diff --git a/lib/sycamore/sycamore/llms/prompts/__init__.py b/lib/sycamore/sycamore/llms/prompts/__init__.py index e9de07b52..b3a4d98c3 100644 --- a/lib/sycamore/sycamore/llms/prompts/__init__.py +++ b/lib/sycamore/sycamore/llms/prompts/__init__.py @@ -10,6 +10,8 @@ SchemaZeroShotGuidancePrompt, PropertiesZeroShotGuidancePrompt, TaskIdentifierZeroShotGuidancePrompt, + ExtractTablePropertiesTablePrompt, + ExtractTablePropertiesPrompt, ) from sycamore.llms.prompts.default_prompts import _deprecated_prompts @@ -20,6 +22,8 @@ "TextSummarizerGuidancePrompt", "SchemaZeroShotGuidancePrompt", "PropertiesZeroShotGuidancePrompt", + "ExtractTablePropertiesTablePrompt", + "ExtractTablePropertiesPrompt", ] + list(_deprecated_prompts.keys()) __all__ = prompts diff --git a/lib/sycamore/sycamore/llms/prompts/default_prompts.py b/lib/sycamore/sycamore/llms/prompts/default_prompts.py index 879d2afed..47a096bbc 100644 --- a/lib/sycamore/sycamore/llms/prompts/default_prompts.py +++ b/lib/sycamore/sycamore/llms/prompts/default_prompts.py @@ -94,6 +94,39 @@ class TaskIdentifierZeroShotGuidancePrompt(SimplePrompt): """ +class ExtractTablePropertiesPrompt(SimplePrompt): + user = """ + You are given a text string where columns are separated by comma representing either a single column, + or multi-column table each new line is a new row. + Instructions: + 1. Parse the table and return a flattened JSON object representing the key-value pairs of properties + defined in the table. + 2. Do not return nested objects, keep the dictionary only 1 level deep. The only valid value types + are numbers, strings, and lists. + 3. If you find multiple fields defined in a row, feel free to split them into separate properties. + 4. Use camelCase for the key names + 5. For fields where the values are in standard measurement units like miles, + nautical miles, knots, celsius + 6. return only the json object between ``` + - include the unit in the key name and only set the numeric value as the value. + - e.g. "Wind Speed: 9 knots" should become windSpeedInKnots: 9, + "Temperature: 3°C" should become temperatureInC: 3 + """ + + +class ExtractTablePropertiesTablePrompt(SimplePrompt): + user = """ + You are given a text string where columns are separated by comma representing either a single column, + or multi-column table each new line is a new row. + Instructions: + 1. Parse the table and make decision if key, value pair information can be extracted from it. + 2. if the table contains multiple cell value corresponding to one key, the key, value pair for such table + cant be extracted. + 3. return True if table cant be parsed as key value pair. + 4. return only True or False nothing should be added in the response. + """ + + class EntityExtractorMessagesPrompt(SimplePrompt): def __init__(self, question: str, field: str, format: Optional[str], discrete: bool = False): super().__init__() diff --git a/lib/sycamore/sycamore/tests/unit/transforms/test_standardizer.py b/lib/sycamore/sycamore/tests/unit/transforms/test_standardizer.py index 9450d64a7..7c8bc5c9c 100644 --- a/lib/sycamore/sycamore/tests/unit/transforms/test_standardizer.py +++ b/lib/sycamore/sycamore/tests/unit/transforms/test_standardizer.py @@ -18,20 +18,18 @@ def setUp(self): ) def test_datetime(self): - date_standardizer = DateTimeStandardizer() - output = StandardizeProperty( - None, standardizer=date_standardizer, path=[["properties", "entity", "dateTime"]] + None, standardizer=DateTimeStandardizer, path=[["properties", "entity", "dateTime"]] ).run(self.input) + assert "properties" in output.keys() assert "entity" in output.properties.keys() assert output.properties.get("entity")["dateTime"] == "March 17, 2023, 14:25 " assert output.properties.get("entity")["day"] == date(2023, 3, 17) def test_location(self): - loc_standardizer = LocationStandardizer() output = StandardizeProperty( - None, standardizer=loc_standardizer, path=[["properties", "entity", "location"]] + None, standardizer=LocationStandardizer, path=[["properties", "entity", "location"]] ).run(self.input) assert "properties" in output.keys() diff --git a/lib/sycamore/sycamore/transforms/assign_doc_properties.py b/lib/sycamore/sycamore/transforms/assign_doc_properties.py index 00836da9b..952196dd2 100644 --- a/lib/sycamore/sycamore/transforms/assign_doc_properties.py +++ b/lib/sycamore/sycamore/transforms/assign_doc_properties.py @@ -7,7 +7,7 @@ class AssignDocProperties(SingleThreadUser, NonGPUUser, Map): """ - The AssignDocProperties transform is used to copy properties from first element pf a specific type + The AssignDocProperties transform is used to copy properties from first element of a specific type to the parent document. This allows for the consolidation of key attributes at the document level. Args: diff --git a/lib/sycamore/sycamore/transforms/extract_table_properties.py b/lib/sycamore/sycamore/transforms/extract_table_properties.py index 2fcaadf4a..b46dbd350 100644 --- a/lib/sycamore/sycamore/transforms/extract_table_properties.py +++ b/lib/sycamore/sycamore/transforms/extract_table_properties.py @@ -7,6 +7,7 @@ import logging from sycamore.transforms.llm_query import LLMTextQueryAgent from sycamore.llms import LLM +from sycamore.llms.prompts import ExtractTablePropertiesPrompt, ExtractTablePropertiesTablePrompt class ExtractTableProperties(SingleThreadUser, NonGPUUser, Map): @@ -52,42 +53,23 @@ def extract_parent_json(input_string: str) -> str: @staticmethod @timetrace("ExtrKeyVal") - def extract_table_properties(parent: Document, property_name: str, llm: LLM) -> Document: + def extract_table_properties( + parent: Document, property_name: str, llm: LLM, prompt_find_table: str = "", prompt_LLM: str = "" + ) -> Document: """ This Method is used to extract key value pair from table using LLM and populate it as property of that element. """ - prompt = """ - You are given a text string where columns are separated by comma representing either a single column, - or multi-column table each new line is a new row. - Instructions: - 1. Parse the table and make decision if key, value pair information can be extracted from it. - 2. if the table contains multiple cell value corresponding to one key, the key, value pair for such table - cant be extracted. - 3. return True if table cant be parsed as key value pair. - 4. return only True or False nothing should be added in the response. - """ - query_agent = LLMTextQueryAgent(prompt=prompt, llm=llm, output_property="keyValueTable", element_type="table") + if prompt_find_table == "": + prompt_find_table = ExtractTablePropertiesTablePrompt().user + query_agent = LLMTextQueryAgent( + prompt=prompt_find_table, llm=llm, output_property="keyValueTable", element_type="table" + ) doc = query_agent.execute_query(parent) - prompt = """ - You are given a text string where columns are separated by comma representing either a single column, - or multi-column table each new line is a new row. - Instructions: - 1. Parse the table and return a flattened JSON object representing the key-value pairs of properties - defined in the table. - 2. Do not return nested objects, keep the dictionary only 1 level deep. The only valid value types - are numbers, strings, and lists. - 3. If you find multiple fields defined in a row, feel free to split them into separate properties. - 4. Use camelCase for the key names - 5. For fields where the values are in standard measurement units like miles, - nautical miles, knots, celsius - 6. return only the json object between ``` - - include the unit in the key name and only set the numeric value as the value. - - e.g. "Wind Speed: 9 knots" should become windSpeedInKnots: 9, - "Temperature: 3°C" should become temperatureInC: 3 - """ - query_agent = LLMTextQueryAgent(prompt=prompt, llm=llm, output_property=property_name, element_type="table") + if prompt_LLM == "": + prompt_LLM = ExtractTablePropertiesPrompt().user + query_agent = LLMTextQueryAgent(prompt=prompt_LLM, llm=llm, output_property=property_name, element_type="table") doc = query_agent.execute_query(parent) for ele in doc.elements: diff --git a/lib/sycamore/sycamore/transforms/standardizer.py b/lib/sycamore/sycamore/transforms/standardizer.py index 068036c26..9de93fd6e 100644 --- a/lib/sycamore/sycamore/transforms/standardizer.py +++ b/lib/sycamore/sycamore/transforms/standardizer.py @@ -31,9 +31,10 @@ def fixer(self, text: str) -> Union[str, Tuple[str, date]]: """ pass + @abstractmethod def standardize(self, doc: Document, key_path: List[str]) -> Document: """ - Applies the fixer method to a specific field in the document as defined by the key_path. + Abstract method applies the fixer method to a specific field in the document as defined by the key_path. Args: doc (Document): The document to be standardized. @@ -45,18 +46,7 @@ def standardize(self, doc: Document, key_path: List[str]) -> Document: Raises: KeyError: If any of the keys in key_path are not found in the document. """ - current = doc - for key in key_path[:-1]: - if current.get(key, None): - current = current[key] - else: - raise KeyError(f"Key {key} not found in the dictionary among {current.keys()}") - target_key = key_path[-1] - if current.get(target_key, None): - current[target_key] = self.fixer(current[target_key]) - else: - raise KeyError(f"Key {target_key} not found in the dictionary among {current.keys()}") - return doc + pass class LocationStandardizer(Standardizer): @@ -118,7 +108,8 @@ class LocationStandardizer(Standardizer): "WY": "Wyoming", } - def fixer(self, text: str) -> str: + @staticmethod + def fixer(text: str) -> str: """ Replaces any US state abbreviations in the text with their full state names. @@ -135,13 +126,42 @@ def replacer(match): return re.sub(r"\b[A-Z]{2}\b", replacer, text) + @staticmethod + def standardize(doc: Document, key_path: List[str]) -> Document: + """ + Applies the fixer method to a specific field in the document as defined by the key_path. + + Args: + doc (Document): The document to be standardized. + key_path (List[str]): The path to the field within the document that should be standardized. + + Returns: + Document: The document with the standardized field. + + Raises: + KeyError: If any of the keys in key_path are not found in the document. + """ + current = doc + for key in key_path[:-1]: + if current.get(key, None): + current = current[key] + else: + raise KeyError(f"Key {key} not found in the dictionary among {current.keys()}") + target_key = key_path[-1] + if current.get(target_key, None): + current[target_key] = LocationStandardizer.fixer(current[target_key]) + else: + raise KeyError(f"Key {target_key} not found in the dictionary among {current.keys()}") + return doc + class DateTimeStandardizer(Standardizer): """ A standardizer for transforming date and time strings into a consistent format. """ - def fixer(self, raw_dateTime: str) -> Tuple[str, date]: + @staticmethod + def fixer(raw_dateTime: str) -> Tuple[str, date]: """ Converts a date-time string by replacing periods with colons and parsing it into a date object. @@ -175,7 +195,8 @@ def fixer(self, raw_dateTime: str) -> Tuple[str, date]: # Handle any other exceptions raise RuntimeError(f"Unexpected error occurred while processing: {raw_dateTime}") from e - def standardize(self, doc: Document, key_path: List[str]) -> Document: + @staticmethod + def standardize(doc: Document, key_path: List[str]) -> Document: """ Applies the fixer method to a specific date-time field in the document as defined by the key_path, and adds an additional "day" field with the extracted date. @@ -199,7 +220,7 @@ def standardize(self, doc: Document, key_path: List[str]) -> Document: raise KeyError(f"Key {key} not found in the dictionary among {current.keys()}") target_key = key_path[-1] if target_key in current.keys(): - current[target_key], current["day"] = self.fixer(current[target_key]) + current[target_key], current["day"] = DateTimeStandardizer.fixer(current[target_key]) else: raise KeyError(f"Key {target_key} not found in the dictionary among {current.keys()}") return doc diff --git a/notebooks/ntsb-demo.ipynb b/notebooks/ntsb-demo.ipynb index 0dcb97513..2177931d2 100644 --- a/notebooks/ntsb-demo.ipynb +++ b/notebooks/ntsb-demo.ipynb @@ -2,26 +2,20 @@ "cells": [ { "cell_type": "markdown", - "id": "cca08214-df54-42d3-ba80-4f73bec83ff6", + "id": "35d91fa2-a4e2-4651-b12d-10e9cbdcf286", "metadata": {}, "source": [ "# NTSB demo\n", "\n", "Sycamore is a **Document Processing System** = **ETL** + **Query Planning/Retrieval**.\n", "\n", - "https://aryn-public.s3.amazonaws.com/ntsb/59.pdf\n", + "https://aryn-public.s3.amazonaws.com/ntsb/22.pdf\n", "\n", "**ETL steps**:\n", - "- extracts Text + Structure:\n", - " - titles,\n", - " - section headers,\n", - " - text paragraphs,\n", - " - figures,\n", - " - tables and their cells\n", - " - captions\n", - " - page headers and footers\n", - " - footnotes\n", - "- it then can merge individual chunks into larger chunks that preserve more of the context\n", + "- extract Text + Structure: titles, section headers, text paragraphs, figures, tables and their cells, captions, page headers and footers, footnotes\n", + "- summarize images\n", + "- extract data from tables\n", + "- standardize locations and dates\n", "\n", "**Query Planning/Retrieval**:\n", "Having these structural elements and information extracted enables the consumer of these document to have much better control over what is being embedded and how, leading to better retrieval in a RAG system.\n", @@ -30,23 +24,19 @@ "**Question answering**:\n", "Sycamore can also extract information from a document. We’ll see how it can extract location and dates from NTSB docs, but also specific elements like aircraft type. \n", "\n", - "\n", - "\n", "Once the data is transformed, we can ask questions on it." ] }, { "cell_type": "code", "execution_count": null, - "id": "f26675d3-14c2-4b75-bfc7-c390ab16f5c8", + "id": "d6a4aca2-4e57-4dfa-8b17-f8bf59c8bea2", "metadata": {}, "outputs": [], "source": [ "import sycamore \n", - "\n", "from sycamore.transforms.partition import ArynPartitioner\n", "from sycamore.transforms.summarize_images import SummarizeImages\n", - "\n", "from sycamore.transforms import (AssignDocProperties, \n", " ExtractTableProperties, \n", " StandardizeProperty, \n", @@ -54,101 +44,49 @@ " DateTimeStandardizer)\n", "\n", "from sycamore.llms import OpenAI \n", - "\n", "from sycamore.utils.aryn_config import ArynConfig, _DEFAULT_PATH\n", - "\n", "from sycamore.utils.pdf_utils import show_pages, enumerate_images_and_tables, display_page_and_table_properties\n", - "\n", - "import json" + "from sycamore.materialize import MaterializeSourceMode \n", + "import json\n", + "from pathlib import Path" ] }, { "cell_type": "code", "execution_count": null, - "id": "f6f7bcf2-7ddb-43fd-bd42-6977e0e3de74", + "id": "e83894dd-b7ec-4dea-be19-98e6c22b832b", "metadata": {}, "outputs": [], "source": [ - "from sycamore.utils.aryn_config import ArynConfig, _DEFAULT_PATH\n", "assert ArynConfig.get_aryn_api_key() != \"\", f\"Unable to find aryn API key. Looked in {_DEFAULT_PATH}\"" ] }, - { - "cell_type": "markdown", - "id": "a8ada540-870e-4e2b-a4a7-ac608ab4df7a", - "metadata": {}, - "source": [ - "### Initialize Sycamore" - ] - }, { "cell_type": "code", "execution_count": null, - "id": "d5db14c5-d7aa-4074-90d8-3cca9988e2e0", - "metadata": { - "scrolled": true - }, + "id": "0b1c192e-4b4f-4b20-8785-5f93fc2524a0", + "metadata": {}, "outputs": [], "source": [ "ctx = sycamore.init()" ] }, - { - "cell_type": "markdown", - "id": "65358d39-1684-46e5-957d-5156f147c71b", - "metadata": {}, - "source": [ - "## Load the data\n", - "\n", - "We're loading NTSB incident reports (pdf documents describing aviation incidents). \n", - "The documents consist of a combination of tables, text, and figures.\n", - "\n", - "We’re loading the data from a public s3 bucket\n", - "\n", - "Sample document: \n", - "https://aryn-public.s3.amazonaws.com/ntsb/59.pdf" - ] - }, { "cell_type": "code", "execution_count": null, - "id": "713ef530-7a1f-482f-a6f6-c072f27624e4", + "id": "81327aa6-010d-44ad-a5ce-1be88384af66", "metadata": {}, "outputs": [], "source": [ - "s3_path = \"s3://aryn-public/ntsb/59.pdf\"\n", - "llm = OpenAI('gpt-4o-mini')" - ] - }, - { - "cell_type": "markdown", - "id": "3b2c8f33-a26f-41c6-9d0d-e06ba8f76c4c", - "metadata": {}, - "source": [ - "## Define a pipeline using SycamorePartitioner" - ] - }, - { - "cell_type": "markdown", - "id": "c3b1c405-4868-4f11-8baa-8784e18da77e", - "metadata": {}, - "source": [ - "### Chunk and extract document structure\n", - "Extracts Text + Structure:\n", - "- titles,\n", - "- section headers,\n", - "- text paragraphs,\n", - "- figures,\n", - "- tables and their cells\n", - "- captions\n", - "- page headers and footers\n", - "- footnotes" + "s3_path = \"s3://aryn-public/ntsb/22.pdf\"\n", + "llm = OpenAI(\"gpt-4o-mini\")\n", + "materialize_dir = Path.cwd() / \"cache\"\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "5e65b7ca-b915-4b3c-b91c-1ffeae0e04a7", + "id": "bc3d902c-1f22-419c-bb50-a79a07bf65df", "metadata": {}, "outputs": [], "source": [ @@ -165,155 +103,153 @@ " \n", " # Summarize each image element\n", " .transform(SummarizeImages)\n", - ")\n", - " " - ] - }, - { - "cell_type": "markdown", - "id": "17ab5ed1-ecdc-437c-94fb-9f467446c410", - "metadata": {}, - "source": [ - "## Visualize partitioned documents" + "\n", + " # Materialize each document\n", + " .materialize(\n", + " path=f\"{materialize_dir}/docset_summarized\",\n", + " source_mode=MaterializeSourceMode.IF_PRESENT)\n", + ")" ] }, { "cell_type": "code", "execution_count": null, - "id": "6e426c05-aa66-42fa-930b-2b407ce7a75c", + "id": "aea93254-9116-4065-a411-338cfa658af2", "metadata": { "scrolled": true }, "outputs": [], "source": [ - "sample_pages = show_pages(docset, limit=4)\n" + "show_pages(docset, limit=25)" ] }, { "cell_type": "code", "execution_count": null, - "id": "f249d7d7-d7af-441b-83f6-1d113ad61b0b", - "metadata": { - "scrolled": true - }, + "id": "e5612962-1953-4494-bb22-e9d63ee4fb04", + "metadata": {}, "outputs": [], "source": [ - "# Show text representation of images and tables\n", - "enumerate_images_and_tables(docset.take(1))" + "enumerate_images_and_tables(docset.take_all())" ] }, { "cell_type": "code", "execution_count": null, - "id": "5919b73b-5f84-429b-98a7-5633406cca3b", + "id": "68a4487b-14c6-49a0-88c9-2369d2b59590", "metadata": { "scrolled": true }, "outputs": [], "source": [ - "# extract properties from tables as key value pair \n", - "ds_extracted = docset.transform(cls = ExtractTableProperties,parameters = ['llm_response', llm])\n", - "ds_extracted.show(limit=1)" + "prompt_LLM = \"\"\"\n", + " You are given a text string where columns are separated by comma representing either a single column, \n", + " or multi-column table each new line is a new row.\n", + " Instructions:\n", + " 1. Parse the table and return a flattened JSON object representing the key-value pairs of properties \n", + " defined in the table.\n", + " 2. Do not return nested objects, keep the dictionary only 1 level deep. The only valid value types \n", + " are numbers, strings, and lists.\n", + " 3. If you find multiple fields defined in a row, feel free to split them into separate properties.\n", + " 4. Use camelCase for the key names\n", + " 5. For fields where the values are in standard measurement units like miles, \n", + " nautical miles, knots, celsius\n", + " 6. return only the json object between ``` \n", + " - include the unit in the key name and only set the numeric value as the value.\n", + " - e.g. \"Wind Speed: 9 knots\" should become windSpeedInKnots: 9, \n", + " \"Temperature: 3°C\" should become temperatureInC: 3\n", + " \"\"\"\n", + "\n", + "\n", + "\n", + "ds_extracted = (\n", + " docset\n", + " # Extract properties from tables and save it as key value pair in respective table elements\n", + " .map( lambda doc: ExtractTableProperties.extract_table_properties( doc, property_name = \"table_props\", llm =llm, prompt_LLM=prompt_LLM)) \n", + "\n", + " # Materialize document\n", + " .materialize(\n", + " path = f\"{materialize_dir}/docset_extrTblprop\",\n", + " source_mode=MaterializeSourceMode.IF_PRESENT)\n", + ")\n", + "\n", + "ds_extracted.show(limit=6)\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "17de9aa0-b6de-466c-9901-ce93cf126565", + "id": "e15c60f4-a451-465a-adc7-f3a8ba9fa0ff", "metadata": {}, "outputs": [], "source": [ - "# AssignDocProperties is used to copy properties from first element of table to the document\n", - "element_type = 'table'\n", - "property_name = 'llm_response'\n", - "\n", - "ds_prop1 = ds_extracted.transform(cls = AssignDocProperties, parameters = [element_type, property_name])\n", - "ds_prop1.show(limit=1, show_elements = False)" - ] - }, - { - "cell_type": "markdown", - "id": "ae7ca5fc-b57b-4b2c-a7ba-1d53f03e66a3", - "metadata": {}, - "source": [ - "## Inspect extracted information\n", - "\n", - "## Notice that dates and locations have been standardized" + "# we assign properties from 1st table element to document level \n", + "ds_prop1 = (ds_extracted.\n", + " map( lambda doc : AssignDocProperties.assign_doc_properties( doc, element_type=\"table\", property_name = \"table_props\")))\n", + "ds_prop1.show(limit=6, show_elements = False)" ] }, { "cell_type": "code", "execution_count": null, - "id": "fb6933bb-92a0-43be-98de-0933c338fc1d", + "id": "0e688d84-376b-4a17-b4e7-230455ce1282", "metadata": {}, "outputs": [], "source": [ + "\n", "# We noramalize the date and location using LocationStandardizer and DateTimeStandardizer transform\n", - "loc_standardizer = LocationStandardizer()\n", - "date_standardizer = DateTimeStandardizer()\n", "\n", - "ds_normd = ds_prop1.transform(cls = StandardizeProperty, standardizer=loc_standardizer, path=[['properties','entity','location']])\n", - "ds_normd = ds_prop1.transform(cls = StandardizeProperty, standardizer=date_standardizer, path=[['properties','entity','dateTime']])\n", "\n", - "ds_normd.show(limit=1, show_elements = False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bab0e9a2-7f24-4fd5-ad15-6b23ac056218", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "show_pages(ds_normd, limit=1)" + "ds_normd = (\n", + " ds_prop1\n", + " \n", + " # Converts state abbreviations to their full names.\n", + " .map( lambda doc: LocationStandardizer.standardize(doc, key_path = [\"properties\",\"entity\",\"location\"]))\n", + "\n", + " # Converts datetime into a common format\n", + " .map( lambda doc: DateTimeStandardizer.standardize(doc, key_path = [\"properties\",\"entity\",\"dateTime\"]))\n", + "\n", + " # Materialize document\n", + " .materialize(\n", + " path=f\"{materialize_dir}/docset_normalized\",\n", + " source_mode=MaterializeSourceMode.IF_PRESENT)\n", + ")\n", + "ds_normd.show(limit=6, show_elements = False)\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "126182dc-35dd-483a-9236-42b5b48cd6ac", - "metadata": { - "scrolled": true - }, + "id": "fdb7d396-a46f-4a83-89ca-22a622a26c17", + "metadata": {}, "outputs": [], "source": [ "# Show tables and their conversion to properties\n", - "display_page_and_table_properties(ds_normd.take(1))" + "display_page_and_table_properties(ds_normd.take())" ] }, { "cell_type": "code", "execution_count": null, - "id": "4609139a-bb39-452a-8c93-29408d375e64", + "id": "a9679f78-eff0-40ef-b17f-ba027189914d", "metadata": { "scrolled": true }, "outputs": [], "source": [ + "## if removed, remove json import up top\n", "from IPython.display import display, HTML\n", "for e in ds_normd.take_all()[0].elements:\n", - " if \"table\" in e.type:\n", + " if \"table\" in e.type and e.table is not None :\n", " print(\"Element Type: \", e.type)\n", " print(\"Element Properties: \", json.dumps(e.properties, indent=2, default=str))\n", " display(HTML(e.table.to_html()))" ] }, - { - "cell_type": "markdown", - "id": "98244b4d-2bd3-48a9-8768-9e17fd708fd6", - "metadata": {}, - "source": [ - "## Indexing the documents for retrieval\n", - "\n", - "Now that we have extracted the text, partitioned it, labeled the partitions, extracted information and standardized it, we're ready to store it for retrieval" - ] - }, { "cell_type": "code", "execution_count": null, - "id": "b9101a62-507c-4784-b9f7-581580fd2a35", + "id": "3ae51ea5-fb84-48f9-995e-cfa1779855a4", "metadata": {}, "outputs": [], "source": [ @@ -325,7 +261,7 @@ " opensearch_host = \"localhost\"\n", " print(\"Assuming we are running outside of a container, using localhost for OpenSearch host\")\n", "\n", - "index = \"ntsb-bb-2_demo\"\n", + "index = \"ntsb-demo-all\"\n", "os_client_args = {\n", " \"hosts\": [{\"host\": \"localhost\", \"port\": 9200}],\n", " \"http_compress\": True,\n", @@ -362,18 +298,10 @@ "}" ] }, - { - "cell_type": "markdown", - "id": "a4669cc4-be71-463a-8eb2-6474c2f4a24a", - "metadata": {}, - "source": [ - "### Creating embeddings" - ] - }, { "cell_type": "code", "execution_count": null, - "id": "eabce93b-2767-45c6-ac73-4a8ef491de16", + "id": "1fe68c03-b48b-4e8c-bd00-a98cab17f4b6", "metadata": {}, "outputs": [], "source": [ @@ -383,23 +311,15 @@ " ds_normd\n", " .spread_properties([\"entity\", \"path\"])\n", " .explode()\n", - " .sketch()\n", - " .embed(embedder=SentenceTransformerEmbedder(batch_size=100, model_name=\"sentence-transformers/all-MiniLM-L6-v2\"), num_gpus=0.1)\n", + " .embed(embedder=SentenceTransformerEmbedder(batch_size=100, model_name=\"sentence-transformers/all-MiniLM-L6-v2\"))\n", + " \n", ")" ] }, - { - "cell_type": "markdown", - "id": "649ab324-104b-4230-af81-f4aa5399da05", - "metadata": {}, - "source": [ - "### Write the OpenSearch index" - ] - }, { "cell_type": "code", "execution_count": null, - "id": "b87a4be8-162d-43a4-a4cf-fde9a91bbb4a", + "id": "02abe9f4-77c5-464d-9402-3e217a5d6cfa", "metadata": {}, "outputs": [], "source": [ @@ -411,26 +331,10 @@ ")" ] }, - { - "cell_type": "markdown", - "id": "b719ebaa-321c-4ff7-ace9-1be0d39ac9be", - "metadata": {}, - "source": [ - "## Answer some questions" - ] - }, - { - "cell_type": "markdown", - "id": "7561fa59-a7a5-47bb-bc88-c1f4a7900f55", - "metadata": {}, - "source": [ - "### Create a connection to OpenSearch" - ] - }, { "cell_type": "code", "execution_count": null, - "id": "5842651f-dbb1-4a62-acb0-b5febc30046d", + "id": "9c4ea2d6-6be0-4f71-b103-d10cd2aad701", "metadata": {}, "outputs": [], "source": [ @@ -456,36 +360,20 @@ "osq = OpenSearchQueryExecutor(os_client_args)" ] }, - { - "cell_type": "markdown", - "id": "229a7d7c-9a69-4711-8c51-08a7f966b419", - "metadata": {}, - "source": [ - "### Question" - ] - }, { "cell_type": "code", "execution_count": null, - "id": "0c07a7fb-8125-4e00-a891-63b433f5d4f0", + "id": "49f0fcde-8351-4b3c-8d9e-8c17b5b5342b", "metadata": {}, "outputs": [], "source": [ - "question = \"Were there any incidents involving red planes\"" - ] - }, - { - "cell_type": "markdown", - "id": "70d7dbc8-b99c-4d22-add4-2a964e65e5ac", - "metadata": {}, - "source": [ - "### Text query" + "question = \"Were there any incidents involving Cirrus airplanes\"" ] }, { "cell_type": "code", "execution_count": null, - "id": "ec38c1d8-5014-4105-b5bc-f218882cc6ce", + "id": "0beefa5e-b497-4a0f-9952-90055e864ab5", "metadata": {}, "outputs": [], "source": [ @@ -502,25 +390,17 @@ { "cell_type": "code", "execution_count": null, - "id": "89903619-a1ba-47d2-8089-6779f785e193", + "id": "f51ca656-68fc-4ee5-9a3e-81811feda2bf", "metadata": {}, "outputs": [], "source": [ "result" ] }, - { - "cell_type": "markdown", - "id": "326a9029-7eb1-48a9-a31b-e7e7c7fe42f0", - "metadata": {}, - "source": [ - "### RAG query" - ] - }, { "cell_type": "code", "execution_count": null, - "id": "f7737adc-e1a5-493a-a0c6-5cbffb15de3a", + "id": "af60fd78-cfda-475b-b3ba-55256c8771fa", "metadata": {}, "outputs": [], "source": [ @@ -549,7 +429,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7a4599fc-245c-45ac-a960-d238910f19c6", + "id": "61a10458-097c-4b6e-8519-9911923b16f2", "metadata": {}, "outputs": [], "source": [ @@ -598,7 +478,7 @@ { "cell_type": "code", "execution_count": null, - "id": "95caade8-09d8-4a85-8029-78ebb37a82ef", + "id": "22f9aeb9-7907-4aea-979a-0a34f41d0dbb", "metadata": {}, "outputs": [], "source": [ @@ -608,39 +488,19 @@ { "cell_type": "code", "execution_count": null, - "id": "0d53cb61-f133-4a70-b2f6-7f8d1eec4715", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "rag_result" - ] - }, - { - "cell_type": "markdown", - "id": "f15de938-7f93-4e23-804c-efefc70ecdaa", - "metadata": {}, - "source": [ - "### Filtered RAG query" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c1621363-bc1e-4a0b-85c2-3b871f3ef4a2", + "id": "686ef630-015a-45ee-b270-35eb769b13cc", "metadata": {}, "outputs": [], "source": [ "filtered_rag_query = OpenSearchQuery()\n", "filtered_rag_query[\"index\"] = index\n", - "question = \"Were there any airplace incidents in Texas in January 14, 2023?\"\n", + "question = \"Were there any airplace incidents in Indiana on January 24, 2023?\"\n", "filter = {\n", " \"bool\": {\n", " \"must\": [\n", " {\n", " \"match_phrase\": {\n", - " \"properties.entity.location\": \"Texas\"\n", + " \"properties.entity.location\": \"Indiana\"\n", " }\n", " }\n", " \n", @@ -697,7 +557,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6d04afa5-7380-4265-a250-c6c9559e394b", + "id": "f07ac2f7-030c-425f-a46b-cb338cd85628", "metadata": {}, "outputs": [], "source": [ @@ -707,7 +567,7 @@ { "cell_type": "code", "execution_count": null, - "id": "adfb56bc-e6a4-4f46-a5cd-4e79b8291675", + "id": "fb8528fa-b9f9-4bdf-8af0-825d11bfeff5", "metadata": {}, "outputs": [], "source": [ @@ -717,7 +577,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ffb8b670-ac29-4bf7-ac5b-0734dcd7abad", + "id": "45b6c6a4-144e-4823-b34b-139e3fb85b93", "metadata": {}, "outputs": [], "source": [] @@ -739,7 +599,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.10.14" } }, "nbformat": 4,