From 1c7b31dd074e763badc631fbd2e5b8ca9f9006e2 Mon Sep 17 00:00:00 2001
From: Soeb-aryn <soebh@aryn.ai>
Date: Mon, 9 Sep 2024 13:14:40 -0700
Subject: [PATCH] improving ExtractTableProperties and standardizer transforms
 (#773)

* ntsb transform changes

* updating ntsb notebook

* removing old file

* lint fixes

* filename fixes

* linting fix

* linting fix

* moving prompts to a common file

* lint fix

---------

Co-authored-by: Soeb <soebh@aryn.ai>
---
 .../sycamore/llms/prompts/__init__.py         |   4 +
 .../sycamore/llms/prompts/default_prompts.py  |  33 ++
 .../unit/transforms/test_standardizer.py      |   8 +-
 .../transforms/assign_doc_properties.py       |   2 +-
 .../transforms/extract_table_properties.py    |  42 +-
 .../sycamore/transforms/standardizer.py       |  55 ++-
 notebooks/ntsb-demo.ipynb                     | 362 ++++++------------
 7 files changed, 202 insertions(+), 304 deletions(-)

diff --git a/lib/sycamore/sycamore/llms/prompts/__init__.py b/lib/sycamore/sycamore/llms/prompts/__init__.py
index e9de07b52..b3a4d98c3 100644
--- a/lib/sycamore/sycamore/llms/prompts/__init__.py
+++ b/lib/sycamore/sycamore/llms/prompts/__init__.py
@@ -10,6 +10,8 @@
     SchemaZeroShotGuidancePrompt,
     PropertiesZeroShotGuidancePrompt,
     TaskIdentifierZeroShotGuidancePrompt,
+    ExtractTablePropertiesTablePrompt,
+    ExtractTablePropertiesPrompt,
 )
 from sycamore.llms.prompts.default_prompts import _deprecated_prompts
 
@@ -20,6 +22,8 @@
     "TextSummarizerGuidancePrompt",
     "SchemaZeroShotGuidancePrompt",
     "PropertiesZeroShotGuidancePrompt",
+    "ExtractTablePropertiesTablePrompt",
+    "ExtractTablePropertiesPrompt",
 ] + list(_deprecated_prompts.keys())
 
 __all__ = prompts
diff --git a/lib/sycamore/sycamore/llms/prompts/default_prompts.py b/lib/sycamore/sycamore/llms/prompts/default_prompts.py
index 879d2afed..47a096bbc 100644
--- a/lib/sycamore/sycamore/llms/prompts/default_prompts.py
+++ b/lib/sycamore/sycamore/llms/prompts/default_prompts.py
@@ -94,6 +94,39 @@ class TaskIdentifierZeroShotGuidancePrompt(SimplePrompt):
     """
 
 
+class ExtractTablePropertiesPrompt(SimplePrompt):
+    user = """
+            You are given a text string where columns are separated by comma representing either a single column, 
+            or multi-column table each new line is a new row.
+            Instructions:
+            1. Parse the table and return a flattened JSON object representing the key-value pairs of properties 
+            defined in the table.
+            2. Do not return nested objects, keep the dictionary only 1 level deep. The only valid value types 
+            are numbers, strings, and lists.
+            3. If you find multiple fields defined in a row, feel free to split them into separate properties.
+            4. Use camelCase for the key names
+            5. For fields where the values are in standard measurement units like miles, 
+            nautical miles, knots, celsius
+            6. return only the json object between ``` 
+            - include the unit in the key name and only set the numeric value as the value.
+            - e.g. "Wind Speed: 9 knots" should become windSpeedInKnots: 9, 
+            "Temperature: 3°C" should become temperatureInC: 3
+            """
+
+
+class ExtractTablePropertiesTablePrompt(SimplePrompt):
+    user = """
+            You are given a text string where columns are separated by comma representing either a single column, 
+            or multi-column table each new line is a new row.
+            Instructions:
+            1. Parse the table and make decision if key, value pair information can be extracted from it.
+            2. if the table contains multiple cell value corresponding to one key, the key, value pair for such table 
+            cant be extracted.
+            3. return True if table cant be parsed as key value pair.
+            4. return only True or False nothing should be added in the response.
+            """
+
+
 class EntityExtractorMessagesPrompt(SimplePrompt):
     def __init__(self, question: str, field: str, format: Optional[str], discrete: bool = False):
         super().__init__()
diff --git a/lib/sycamore/sycamore/tests/unit/transforms/test_standardizer.py b/lib/sycamore/sycamore/tests/unit/transforms/test_standardizer.py
index 9450d64a7..7c8bc5c9c 100644
--- a/lib/sycamore/sycamore/tests/unit/transforms/test_standardizer.py
+++ b/lib/sycamore/sycamore/tests/unit/transforms/test_standardizer.py
@@ -18,20 +18,18 @@ def setUp(self):
         )
 
     def test_datetime(self):
-        date_standardizer = DateTimeStandardizer()
-
         output = StandardizeProperty(
-            None, standardizer=date_standardizer, path=[["properties", "entity", "dateTime"]]
+            None, standardizer=DateTimeStandardizer, path=[["properties", "entity", "dateTime"]]
         ).run(self.input)
+
         assert "properties" in output.keys()
         assert "entity" in output.properties.keys()
         assert output.properties.get("entity")["dateTime"] == "March 17, 2023, 14:25 "
         assert output.properties.get("entity")["day"] == date(2023, 3, 17)
 
     def test_location(self):
-        loc_standardizer = LocationStandardizer()
         output = StandardizeProperty(
-            None, standardizer=loc_standardizer, path=[["properties", "entity", "location"]]
+            None, standardizer=LocationStandardizer, path=[["properties", "entity", "location"]]
         ).run(self.input)
 
         assert "properties" in output.keys()
diff --git a/lib/sycamore/sycamore/transforms/assign_doc_properties.py b/lib/sycamore/sycamore/transforms/assign_doc_properties.py
index 00836da9b..952196dd2 100644
--- a/lib/sycamore/sycamore/transforms/assign_doc_properties.py
+++ b/lib/sycamore/sycamore/transforms/assign_doc_properties.py
@@ -7,7 +7,7 @@
 
 class AssignDocProperties(SingleThreadUser, NonGPUUser, Map):
     """
-    The AssignDocProperties transform is used to copy properties from first element pf a specific type
+    The AssignDocProperties transform is used to copy properties from the first element of a specific type
     to the parent document. This allows for the consolidation of key attributes at the document level.
 
     Args:
diff --git a/lib/sycamore/sycamore/transforms/extract_table_properties.py b/lib/sycamore/sycamore/transforms/extract_table_properties.py
index 2fcaadf4a..b46dbd350 100644
--- a/lib/sycamore/sycamore/transforms/extract_table_properties.py
+++ b/lib/sycamore/sycamore/transforms/extract_table_properties.py
@@ -7,6 +7,7 @@
 import logging
 from sycamore.transforms.llm_query import LLMTextQueryAgent
 from sycamore.llms import LLM
+from sycamore.llms.prompts import ExtractTablePropertiesPrompt, ExtractTablePropertiesTablePrompt
 
 
 class ExtractTableProperties(SingleThreadUser, NonGPUUser, Map):
@@ -52,42 +53,23 @@ def extract_parent_json(input_string: str) -> str:
 
     @staticmethod
     @timetrace("ExtrKeyVal")
-    def extract_table_properties(parent: Document, property_name: str, llm: LLM) -> Document:
+    def extract_table_properties(
+        parent: Document, property_name: str, llm: LLM, prompt_find_table: str = "", prompt_LLM: str = ""
+    ) -> Document:
         """
         This Method is used to extract key value pair from table using LLM and
         populate it as property of that element.
         """
-        prompt = """
-        You are given a text string where columns are separated by comma representing either a single column, 
-        or multi-column table each new line is a new row.
-        Instructions:
-        1. Parse the table and make decision if key, value pair information can be extracted from it.
-        2. if the table contains multiple cell value corresponding to one key, the key, value pair for such table 
-        cant be extracted.
-        3. return True if table cant be parsed as key value pair.
-        4. return only True or False nothing should be added in the response.
-        """
-        query_agent = LLMTextQueryAgent(prompt=prompt, llm=llm, output_property="keyValueTable", element_type="table")
+        if prompt_find_table == "":
+            prompt_find_table = ExtractTablePropertiesTablePrompt().user
+        query_agent = LLMTextQueryAgent(
+            prompt=prompt_find_table, llm=llm, output_property="keyValueTable", element_type="table"
+        )
         doc = query_agent.execute_query(parent)
 
-        prompt = """
-        You are given a text string where columns are separated by comma representing either a single column, 
-        or multi-column table each new line is a new row.
-        Instructions:
-        1. Parse the table and return a flattened JSON object representing the key-value pairs of properties 
-        defined in the table.
-        2. Do not return nested objects, keep the dictionary only 1 level deep. The only valid value types 
-        are numbers, strings, and lists.
-        3. If you find multiple fields defined in a row, feel free to split them into separate properties.
-        4. Use camelCase for the key names
-        5. For fields where the values are in standard measurement units like miles, 
-        nautical miles, knots, celsius
-        6. return only the json object between ``` 
-        - include the unit in the key name and only set the numeric value as the value.
-        - e.g. "Wind Speed: 9 knots" should become windSpeedInKnots: 9, 
-        "Temperature: 3°C" should become temperatureInC: 3
-        """
-        query_agent = LLMTextQueryAgent(prompt=prompt, llm=llm, output_property=property_name, element_type="table")
+        if prompt_LLM == "":
+            prompt_LLM = ExtractTablePropertiesPrompt().user
+        query_agent = LLMTextQueryAgent(prompt=prompt_LLM, llm=llm, output_property=property_name, element_type="table")
         doc = query_agent.execute_query(parent)
 
         for ele in doc.elements:
diff --git a/lib/sycamore/sycamore/transforms/standardizer.py b/lib/sycamore/sycamore/transforms/standardizer.py
index 068036c26..9de93fd6e 100644
--- a/lib/sycamore/sycamore/transforms/standardizer.py
+++ b/lib/sycamore/sycamore/transforms/standardizer.py
@@ -31,9 +31,10 @@ def fixer(self, text: str) -> Union[str, Tuple[str, date]]:
         """
         pass
 
+    @abstractmethod
     def standardize(self, doc: Document, key_path: List[str]) -> Document:
         """
-        Applies the fixer method to a specific field in the document as defined by the key_path.
+        Abstract method; implementations apply the fixer to a specific field in the document as defined by key_path.
 
         Args:
             doc (Document): The document to be standardized.
@@ -45,18 +46,7 @@ def standardize(self, doc: Document, key_path: List[str]) -> Document:
         Raises:
             KeyError: If any of the keys in key_path are not found in the document.
         """
-        current = doc
-        for key in key_path[:-1]:
-            if current.get(key, None):
-                current = current[key]
-            else:
-                raise KeyError(f"Key {key} not found in the dictionary among {current.keys()}")
-        target_key = key_path[-1]
-        if current.get(target_key, None):
-            current[target_key] = self.fixer(current[target_key])
-        else:
-            raise KeyError(f"Key {target_key} not found in the dictionary among {current.keys()}")
-        return doc
+        pass
 
 
 class LocationStandardizer(Standardizer):
@@ -118,7 +108,8 @@ class LocationStandardizer(Standardizer):
         "WY": "Wyoming",
     }
 
-    def fixer(self, text: str) -> str:
+    @staticmethod
+    def fixer(text: str) -> str:
         """
         Replaces any US state abbreviations in the text with their full state names.
 
@@ -135,13 +126,42 @@ def replacer(match):
 
         return re.sub(r"\b[A-Z]{2}\b", replacer, text)
 
+    @staticmethod
+    def standardize(doc: Document, key_path: List[str]) -> Document:
+        """
+        Applies the fixer method to a specific field in the document as defined by the key_path.
+
+        Args:
+            doc (Document): The document to be standardized.
+            key_path (List[str]): The path to the field within the document that should be standardized.
+
+        Returns:
+            Document: The document with the standardized field.
+
+        Raises:
+            KeyError: If any of the keys in key_path are not found in the document.
+        """
+        current = doc
+        for key in key_path[:-1]:
+            if current.get(key, None):
+                current = current[key]
+            else:
+                raise KeyError(f"Key {key} not found in the dictionary among {current.keys()}")
+        target_key = key_path[-1]
+        if current.get(target_key, None):
+            current[target_key] = LocationStandardizer.fixer(current[target_key])
+        else:
+            raise KeyError(f"Key {target_key} not found in the dictionary among {current.keys()}")
+        return doc
+
 
 class DateTimeStandardizer(Standardizer):
     """
     A standardizer for transforming date and time strings into a consistent format.
     """
 
-    def fixer(self, raw_dateTime: str) -> Tuple[str, date]:
+    @staticmethod
+    def fixer(raw_dateTime: str) -> Tuple[str, date]:
         """
         Converts a date-time string by replacing periods with colons and parsing it into a date object.
 
@@ -175,7 +195,8 @@ def fixer(self, raw_dateTime: str) -> Tuple[str, date]:
             # Handle any other exceptions
             raise RuntimeError(f"Unexpected error occurred while processing: {raw_dateTime}") from e
 
-    def standardize(self, doc: Document, key_path: List[str]) -> Document:
+    @staticmethod
+    def standardize(doc: Document, key_path: List[str]) -> Document:
         """
         Applies the fixer method to a specific date-time field in the document as defined by the key_path,
         and adds an additional "day" field with the extracted date.
@@ -199,7 +220,7 @@ def standardize(self, doc: Document, key_path: List[str]) -> Document:
                 raise KeyError(f"Key {key} not found in the dictionary among {current.keys()}")
         target_key = key_path[-1]
         if target_key in current.keys():
-            current[target_key], current["day"] = self.fixer(current[target_key])
+            current[target_key], current["day"] = DateTimeStandardizer.fixer(current[target_key])
         else:
             raise KeyError(f"Key {target_key} not found in the dictionary among {current.keys()}")
         return doc
diff --git a/notebooks/ntsb-demo.ipynb b/notebooks/ntsb-demo.ipynb
index 0dcb97513..2177931d2 100644
--- a/notebooks/ntsb-demo.ipynb
+++ b/notebooks/ntsb-demo.ipynb
@@ -2,26 +2,20 @@
  "cells": [
   {
    "cell_type": "markdown",
-   "id": "cca08214-df54-42d3-ba80-4f73bec83ff6",
+   "id": "35d91fa2-a4e2-4651-b12d-10e9cbdcf286",
    "metadata": {},
    "source": [
     "# NTSB demo\n",
     "\n",
     "Sycamore is a **Document Processing System** = **ETL** + **Query Planning/Retrieval**.\n",
     "\n",
-    "https://aryn-public.s3.amazonaws.com/ntsb/59.pdf\n",
+    "https://aryn-public.s3.amazonaws.com/ntsb/22.pdf\n",
     "\n",
     "**ETL steps**:\n",
-    "- extracts Text + Structure:\n",
-    "    - titles,\n",
-    "    - section headers,\n",
-    "    - text paragraphs,\n",
-    "    - figures,\n",
-    "    - tables and their cells\n",
-    "    - captions\n",
-    "    - page headers and footers\n",
-    "    - footnotes\n",
-    "- it then can merge individual chunks into larger chunks that preserve more of the context\n",
+    "- extract Text + Structure: titles, section headers, text paragraphs, figures, tables and their cells, captions, page headers and footers, footnotes\n",
+    "- summarize images\n",
+    "- extract data from tables\n",
+    "- standardize locations and dates\n",
     "\n",
     "**Query Planning/Retrieval**:\n",
     "Having these structural elements and information extracted enables the consumer of these document to have much better control over what is being embedded and how, leading to better retrieval in a RAG system.\n",
@@ -30,23 +24,19 @@
     "**Question answering**:\n",
     "Sycamore can also extract information from a document. We’ll see how it can extract location and dates from NTSB docs, but also specific elements like aircraft type.  \n",
     "\n",
-    "\n",
-    "\n",
     "Once the data is transformed, we can ask questions on it."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "f26675d3-14c2-4b75-bfc7-c390ab16f5c8",
+   "id": "d6a4aca2-4e57-4dfa-8b17-f8bf59c8bea2",
    "metadata": {},
    "outputs": [],
    "source": [
     "import sycamore \n",
-    "\n",
     "from sycamore.transforms.partition import ArynPartitioner\n",
     "from sycamore.transforms.summarize_images import SummarizeImages\n",
-    "\n",
     "from sycamore.transforms import (AssignDocProperties, \n",
     "                                ExtractTableProperties, \n",
     "                                StandardizeProperty, \n",
@@ -54,101 +44,49 @@
     "                                DateTimeStandardizer)\n",
     "\n",
     "from sycamore.llms import OpenAI \n",
-    "\n",
     "from sycamore.utils.aryn_config import ArynConfig, _DEFAULT_PATH\n",
-    "\n",
     "from sycamore.utils.pdf_utils import show_pages, enumerate_images_and_tables, display_page_and_table_properties\n",
-    "\n",
-    "import json"
+    "from sycamore.materialize import MaterializeSourceMode  \n",
+    "import json\n",
+    "from pathlib import Path"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "f6f7bcf2-7ddb-43fd-bd42-6977e0e3de74",
+   "id": "e83894dd-b7ec-4dea-be19-98e6c22b832b",
    "metadata": {},
    "outputs": [],
    "source": [
-    "from sycamore.utils.aryn_config import ArynConfig, _DEFAULT_PATH\n",
     "assert ArynConfig.get_aryn_api_key() != \"\", f\"Unable to find aryn API key.  Looked in {_DEFAULT_PATH}\""
    ]
   },
-  {
-   "cell_type": "markdown",
-   "id": "a8ada540-870e-4e2b-a4a7-ac608ab4df7a",
-   "metadata": {},
-   "source": [
-    "### Initialize Sycamore"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "d5db14c5-d7aa-4074-90d8-3cca9988e2e0",
-   "metadata": {
-    "scrolled": true
-   },
+   "id": "0b1c192e-4b4f-4b20-8785-5f93fc2524a0",
+   "metadata": {},
    "outputs": [],
    "source": [
     "ctx = sycamore.init()"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "id": "65358d39-1684-46e5-957d-5156f147c71b",
-   "metadata": {},
-   "source": [
-    "## Load the data\n",
-    "\n",
-    "We're loading NTSB incident reports (pdf documents describing aviation incidents). \n",
-    "The documents consist of a combination of tables, text, and figures.\n",
-    "\n",
-    "We’re loading the data from a public s3 bucket\n",
-    "\n",
-    "Sample document: \n",
-    "https://aryn-public.s3.amazonaws.com/ntsb/59.pdf"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "713ef530-7a1f-482f-a6f6-c072f27624e4",
+   "id": "81327aa6-010d-44ad-a5ce-1be88384af66",
    "metadata": {},
    "outputs": [],
    "source": [
-    "s3_path = \"s3://aryn-public/ntsb/59.pdf\"\n",
-    "llm = OpenAI('gpt-4o-mini')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "3b2c8f33-a26f-41c6-9d0d-e06ba8f76c4c",
-   "metadata": {},
-   "source": [
-    "## Define a pipeline using SycamorePartitioner"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "c3b1c405-4868-4f11-8baa-8784e18da77e",
-   "metadata": {},
-   "source": [
-    "### Chunk and extract document structure\n",
-    "Extracts Text + Structure:\n",
-    "- titles,\n",
-    "- section headers,\n",
-    "- text paragraphs,\n",
-    "- figures,\n",
-    "- tables and their cells\n",
-    "- captions\n",
-    "- page headers and footers\n",
-    "- footnotes"
+    "s3_path = \"s3://aryn-public/ntsb/22.pdf\"\n",
+    "llm = OpenAI(\"gpt-4o-mini\")\n",
+    "materialize_dir = Path.cwd() / \"cache\"\n"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "5e65b7ca-b915-4b3c-b91c-1ffeae0e04a7",
+   "id": "bc3d902c-1f22-419c-bb50-a79a07bf65df",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -165,155 +103,153 @@
     "    \n",
     "    # Summarize each image element\n",
     "    .transform(SummarizeImages)\n",
-    ")\n",
-    "    "
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "17ab5ed1-ecdc-437c-94fb-9f467446c410",
-   "metadata": {},
-   "source": [
-    "## Visualize partitioned documents"
+    "\n",
+    "    # Materialize each document\n",
+    "    .materialize(\n",
+    "        path=f\"{materialize_dir}/docset_summarized\",\n",
+    "        source_mode=MaterializeSourceMode.IF_PRESENT)\n",
+    ")"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "6e426c05-aa66-42fa-930b-2b407ce7a75c",
+   "id": "aea93254-9116-4065-a411-338cfa658af2",
    "metadata": {
     "scrolled": true
    },
    "outputs": [],
    "source": [
-    "sample_pages = show_pages(docset, limit=4)\n"
+    "show_pages(docset, limit=25)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "f249d7d7-d7af-441b-83f6-1d113ad61b0b",
-   "metadata": {
-    "scrolled": true
-   },
+   "id": "e5612962-1953-4494-bb22-e9d63ee4fb04",
+   "metadata": {},
    "outputs": [],
    "source": [
-    "# Show text representation of images and tables\n",
-    "enumerate_images_and_tables(docset.take(1))"
+    "enumerate_images_and_tables(docset.take_all())"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "5919b73b-5f84-429b-98a7-5633406cca3b",
+   "id": "68a4487b-14c6-49a0-88c9-2369d2b59590",
    "metadata": {
     "scrolled": true
    },
    "outputs": [],
    "source": [
-    "# extract properties from tables as key value pair \n",
-    "ds_extracted = docset.transform(cls = ExtractTableProperties,parameters = ['llm_response', llm])\n",
-    "ds_extracted.show(limit=1)"
+    "prompt_LLM = \"\"\"\n",
+    "            You are given a text string where columns are separated by comma representing either a single column, \n",
+    "            or multi-column table each new line is a new row.\n",
+    "            Instructions:\n",
+    "            1. Parse the table and return a flattened JSON object representing the key-value pairs of properties \n",
+    "            defined in the table.\n",
+    "            2. Do not return nested objects, keep the dictionary only 1 level deep. The only valid value types \n",
+    "            are numbers, strings, and lists.\n",
+    "            3. If you find multiple fields defined in a row, feel free to split them into separate properties.\n",
+    "            4. Use camelCase for the key names\n",
+    "            5. For fields where the values are in standard measurement units like miles, \n",
+    "            nautical miles, knots, celsius\n",
+    "            6. return only the json object between ``` \n",
+    "            - include the unit in the key name and only set the numeric value as the value.\n",
+    "            - e.g. \"Wind Speed: 9 knots\" should become windSpeedInKnots: 9, \n",
+    "            \"Temperature: 3°C\" should become temperatureInC: 3\n",
+    "            \"\"\"\n",
+    "\n",
+    "\n",
+    "\n",
+    "ds_extracted = (\n",
+    "    docset\n",
+    "    # Extract properties from tables and save them as key-value pairs in the respective table elements\n",
+    "    .map( lambda doc: ExtractTableProperties.extract_table_properties( doc, property_name = \"table_props\", llm =llm, prompt_LLM=prompt_LLM)) \n",
+    "\n",
+    "    # Materialize document\n",
+    "    .materialize(\n",
+    "        path = f\"{materialize_dir}/docset_extrTblprop\",\n",
+    "        source_mode=MaterializeSourceMode.IF_PRESENT)\n",
+    ")\n",
+    "\n",
+    "ds_extracted.show(limit=6)\n"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "17de9aa0-b6de-466c-9901-ce93cf126565",
+   "id": "e15c60f4-a451-465a-adc7-f3a8ba9fa0ff",
    "metadata": {},
    "outputs": [],
    "source": [
-    "# AssignDocProperties is used to copy properties from first element of table to the document\n",
-    "element_type = 'table'\n",
-    "property_name = 'llm_response'\n",
-    "\n",
-    "ds_prop1 = ds_extracted.transform(cls = AssignDocProperties, parameters = [element_type, property_name])\n",
-    "ds_prop1.show(limit=1, show_elements = False)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "ae7ca5fc-b57b-4b2c-a7ba-1d53f03e66a3",
-   "metadata": {},
-   "source": [
-    "##  Inspect extracted information\n",
-    "\n",
-    "##  Notice that dates and locations have been standardized"
+    "# Assign properties from the first table element to the document level\n",
+    "ds_prop1 = (ds_extracted.\n",
+    "            map( lambda doc : AssignDocProperties.assign_doc_properties( doc, element_type=\"table\", property_name = \"table_props\")))\n",
+    "ds_prop1.show(limit=6, show_elements = False)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "fb6933bb-92a0-43be-98de-0933c338fc1d",
+   "id": "0e688d84-376b-4a17-b4e7-230455ce1282",
    "metadata": {},
    "outputs": [],
    "source": [
+    "\n",
     "# We noramalize the date and location using LocationStandardizer and DateTimeStandardizer transform\n",
-    "loc_standardizer = LocationStandardizer()\n",
-    "date_standardizer = DateTimeStandardizer()\n",
     "\n",
-    "ds_normd = ds_prop1.transform(cls = StandardizeProperty, standardizer=loc_standardizer, path=[['properties','entity','location']])\n",
-    "ds_normd = ds_prop1.transform(cls = StandardizeProperty, standardizer=date_standardizer, path=[['properties','entity','dateTime']])\n",
     "\n",
-    "ds_normd.show(limit=1, show_elements = False)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "bab0e9a2-7f24-4fd5-ad15-6b23ac056218",
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [],
-   "source": [
-    "show_pages(ds_normd, limit=1)"
+    "ds_normd = (\n",
+    "    ds_prop1\n",
+    "    \n",
+    "    # Converts state abbreviations to their full names.\n",
+    "    .map( lambda doc: LocationStandardizer.standardize(doc, key_path = [\"properties\",\"entity\",\"location\"]))\n",
+    "\n",
+    "    # Converts datetime into a common format\n",
+    "    .map( lambda doc: DateTimeStandardizer.standardize(doc, key_path = [\"properties\",\"entity\",\"dateTime\"]))\n",
+    "\n",
+    "    # Materialize document\n",
+    "    .materialize(\n",
+    "        path=f\"{materialize_dir}/docset_normalized\",\n",
+    "        source_mode=MaterializeSourceMode.IF_PRESENT)\n",
+    ")\n",
+    "ds_normd.show(limit=6, show_elements = False)\n"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "126182dc-35dd-483a-9236-42b5b48cd6ac",
-   "metadata": {
-    "scrolled": true
-   },
+   "id": "fdb7d396-a46f-4a83-89ca-22a622a26c17",
+   "metadata": {},
    "outputs": [],
    "source": [
     "# Show tables and their conversion to properties\n",
-    "display_page_and_table_properties(ds_normd.take(1))"
+    "display_page_and_table_properties(ds_normd.take())"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "4609139a-bb39-452a-8c93-29408d375e64",
+   "id": "a9679f78-eff0-40ef-b17f-ba027189914d",
    "metadata": {
     "scrolled": true
    },
    "outputs": [],
    "source": [
+    "# NOTE: this cell uses json; if this cell is removed, also remove the json import at the top\n",
     "from IPython.display import display, HTML\n",
     "for e in ds_normd.take_all()[0].elements:\n",
-    "    if \"table\" in e.type:\n",
+    "    if \"table\" in e.type and e.table is not None :\n",
     "        print(\"Element Type: \", e.type)\n",
     "        print(\"Element Properties: \", json.dumps(e.properties, indent=2, default=str))\n",
     "        display(HTML(e.table.to_html()))"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "id": "98244b4d-2bd3-48a9-8768-9e17fd708fd6",
-   "metadata": {},
-   "source": [
-    "## Indexing the documents for retrieval\n",
-    "\n",
-    "Now that we have extracted the text, partitioned it, labeled the partitions, extracted information and standardized it, we're ready to store it for retrieval"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "b9101a62-507c-4784-b9f7-581580fd2a35",
+   "id": "3ae51ea5-fb84-48f9-995e-cfa1779855a4",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -325,7 +261,7 @@
     "    opensearch_host = \"localhost\"\n",
     "    print(\"Assuming we are running outside of a container, using localhost for OpenSearch host\")\n",
     "\n",
-    "index = \"ntsb-bb-2_demo\"\n",
+    "index = \"ntsb-demo-all\"\n",
     "os_client_args = {\n",
     "    \"hosts\": [{\"host\": \"localhost\", \"port\": 9200}],\n",
     "    \"http_compress\": True,\n",
@@ -362,18 +298,10 @@
     "}"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "id": "a4669cc4-be71-463a-8eb2-6474c2f4a24a",
-   "metadata": {},
-   "source": [
-    "### Creating embeddings"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "eabce93b-2767-45c6-ac73-4a8ef491de16",
+   "id": "1fe68c03-b48b-4e8c-bd00-a98cab17f4b6",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -383,23 +311,15 @@
     "    ds_normd\n",
     "    .spread_properties([\"entity\", \"path\"])\n",
     "    .explode()\n",
-    "    .sketch()\n",
-    "    .embed(embedder=SentenceTransformerEmbedder(batch_size=100, model_name=\"sentence-transformers/all-MiniLM-L6-v2\"), num_gpus=0.1)\n",
+    "    .embed(embedder=SentenceTransformerEmbedder(batch_size=100, model_name=\"sentence-transformers/all-MiniLM-L6-v2\"))\n",
+    "    \n",
     ")"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "id": "649ab324-104b-4230-af81-f4aa5399da05",
-   "metadata": {},
-   "source": [
-    "### Write the OpenSearch index"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "b87a4be8-162d-43a4-a4cf-fde9a91bbb4a",
+   "id": "02abe9f4-77c5-464d-9402-3e217a5d6cfa",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -411,26 +331,10 @@
     ")"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "id": "b719ebaa-321c-4ff7-ace9-1be0d39ac9be",
-   "metadata": {},
-   "source": [
-    "## Answer some questions"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "7561fa59-a7a5-47bb-bc88-c1f4a7900f55",
-   "metadata": {},
-   "source": [
-    "### Create a connection to OpenSearch"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "5842651f-dbb1-4a62-acb0-b5febc30046d",
+   "id": "9c4ea2d6-6be0-4f71-b103-d10cd2aad701",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -456,36 +360,20 @@
     "osq = OpenSearchQueryExecutor(os_client_args)"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "id": "229a7d7c-9a69-4711-8c51-08a7f966b419",
-   "metadata": {},
-   "source": [
-    "### Question"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "0c07a7fb-8125-4e00-a891-63b433f5d4f0",
+   "id": "49f0fcde-8351-4b3c-8d9e-8c17b5b5342b",
    "metadata": {},
    "outputs": [],
    "source": [
-    "question = \"Were there any incidents involving red planes\""
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "70d7dbc8-b99c-4d22-add4-2a964e65e5ac",
-   "metadata": {},
-   "source": [
-    "### Text query"
+    "question = \"Were there any incidents involving Cirrus airplanes\""
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "ec38c1d8-5014-4105-b5bc-f218882cc6ce",
+   "id": "0beefa5e-b497-4a0f-9952-90055e864ab5",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -502,25 +390,17 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "89903619-a1ba-47d2-8089-6779f785e193",
+   "id": "f51ca656-68fc-4ee5-9a3e-81811feda2bf",
    "metadata": {},
    "outputs": [],
    "source": [
     "result"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "id": "326a9029-7eb1-48a9-a31b-e7e7c7fe42f0",
-   "metadata": {},
-   "source": [
-    "### RAG query"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "f7737adc-e1a5-493a-a0c6-5cbffb15de3a",
+   "id": "af60fd78-cfda-475b-b3ba-55256c8771fa",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -549,7 +429,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "7a4599fc-245c-45ac-a960-d238910f19c6",
+   "id": "61a10458-097c-4b6e-8519-9911923b16f2",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -598,7 +478,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "95caade8-09d8-4a85-8029-78ebb37a82ef",
+   "id": "22f9aeb9-7907-4aea-979a-0a34f41d0dbb",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -608,39 +488,19 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "0d53cb61-f133-4a70-b2f6-7f8d1eec4715",
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [],
-   "source": [
-    "rag_result"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "f15de938-7f93-4e23-804c-efefc70ecdaa",
-   "metadata": {},
-   "source": [
-    "### Filtered RAG query"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "c1621363-bc1e-4a0b-85c2-3b871f3ef4a2",
+   "id": "686ef630-015a-45ee-b270-35eb769b13cc",
    "metadata": {},
    "outputs": [],
    "source": [
     "filtered_rag_query = OpenSearchQuery()\n",
     "filtered_rag_query[\"index\"] = index\n",
-    "question = \"Were there any airplace incidents in Texas in January 14, 2023?\"\n",
+    "question = \"Were there any airplace incidents in Indiana on January 24, 2023?\"\n",
     "filter = {\n",
     "    \"bool\": {\n",
     "        \"must\": [\n",
     "          {\n",
     "            \"match_phrase\": {\n",
-    "              \"properties.entity.location\": \"Texas\"\n",
+    "              \"properties.entity.location\": \"Indiana\"\n",
     "            }\n",
     "          }\n",
     "          \n",
@@ -697,7 +557,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "6d04afa5-7380-4265-a250-c6c9559e394b",
+   "id": "f07ac2f7-030c-425f-a46b-cb338cd85628",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -707,7 +567,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "adfb56bc-e6a4-4f46-a5cd-4e79b8291675",
+   "id": "fb8528fa-b9f9-4bdf-8af0-825d11bfeff5",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -717,7 +577,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "ffb8b670-ac29-4bf7-ac5b-0734dcd7abad",
+   "id": "45b6c6a4-144e-4823-b34b-139e3fb85b93",
    "metadata": {},
    "outputs": [],
    "source": []
@@ -739,7 +599,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.9"
+   "version": "3.10.14"
   }
  },
  "nbformat": 4,