Fixup notebook tests (#933)
* Add a script to run notebook tests that will
  * optionally run in both local and Ray mode
  * split between fast and slow tests
  * verify all notebooks are accounted for
* Fix file scans so they can run with invalid AWS credentials in the environment, using the
  same trick as materialize: try anonymous access if the default credentials don't work.
* Fix the ndd_example notebook to not specify parallelism.
* Fix the tutorial notebook to use materialize to avoid some re-execution.
* Add some logging.
* Lock in the new 0.1.24 sycamore release.
* Ignore notebooks/tmp.
eric-anderson authored Oct 18, 2024
1 parent f6c47fd commit f0a358c
Showing 10 changed files with 265 additions and 14 deletions.
78 changes: 77 additions & 1 deletion .github/workflows/testing.yml
@@ -105,6 +105,45 @@ jobs:
        run: poetry run pytest remote_processors/test/unit/
        working-directory: lib/remote-processors

  notebook-tests-fast:
    runs-on: integ-test-runner2
    strategy:
      matrix:
        python-version: ["3.9"]
    services:
      opensearch:
        image: opensearchproject/opensearch:2.10.0
        env:
          discovery.type: "single-node"
        ports:
          - 9200:9200

    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Install poetry
        run: pipx install poetry
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
          cache: "poetry"
      - name: Install sycamore
        run: poetry install --all-extras
      - name: Download nltk packages
        run: poetry run python -m nltk.downloader punkt averaged_perceptron_tagger
      - name: Update Apt
        run: sudo apt-get update
      - name: Install poppler and tesseract
        run: sudo apt-get install -y poppler-utils tesseract-ocr
      - name: Configure AWS Credentials via OIDC provider.
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-region: us-east-1
          role-to-assume: arn:aws:iam::237550789389:role/aryn-github-integ
      - name: Run Notebook tests
        run: ./notebooks/run-notebook-tests.sh --fast

  integ-tests:
    runs-on: integ-test-runner2
    strategy:
@@ -175,5 +214,42 @@ jobs:
        run: sudo chmod -R 777 /neo4j/import
      - name: Run Integ tests
        run: poetry run pytest lib/sycamore/sycamore/tests/integration

  notebook-tests-slow:
    runs-on: integ-test-runner2
    strategy:
      matrix:
        python-version: ["3.9"]
    services:
      opensearch:
        image: opensearchproject/opensearch:2.10.0
        env:
          discovery.type: "single-node"
        ports:
          - 9200:9200

    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Install poetry
        run: pipx install poetry
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
          cache: "poetry"
      - name: Install sycamore
        run: poetry install --all-extras
      - name: Download nltk packages
        run: poetry run python -m nltk.downloader punkt averaged_perceptron_tagger
      - name: Update Apt
        run: sudo apt-get update
      - name: Install poppler and tesseract
        run: sudo apt-get install -y poppler-utils tesseract-ocr
      - name: Configure AWS Credentials via OIDC provider.
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-region: us-east-1
          role-to-assume: arn:aws:iam::237550789389:role/aryn-github-integ
      - name: Run Notebook tests
-        run: poetry run pytest --nbmake --nbmake-timeout=600 notebooks/default-prep-script.ipynb notebooks/jupyter_dev_example.ipynb notebooks/metadata-extraction.ipynb notebooks/sycamore_demo.ipynb notebooks/tutorial.ipynb
        run: ./notebooks/run-notebook-tests.sh --slow
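Both jobs now delegate notebook selection to notebooks/run-notebook-tests.sh (added below), which wraps pytest's nbmake plugin. As a rough sketch, the same check can be driven from Python; this assumes pytest and nbmake are installed in the poetry environment, and the notebook path is only illustrative:

```python
# Mirrors the CI invocation: poetry run pytest --nbmake --nbmake-timeout=600 <notebook>
import pytest

exit_code = pytest.main(["--nbmake", "--nbmake-timeout=600", "notebooks/tutorial.ipynb"])
raise SystemExit(int(exit_code))
```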
1 change: 1 addition & 0 deletions .gitignore
@@ -47,3 +47,4 @@ notebooks/default-prep-data
.python-version
apps/timetrace/ttviz
notebooks/pc-tutorial
notebooks/tmp
30 changes: 29 additions & 1 deletion lib/sycamore/sycamore/connectors/file/file_scan.py
@@ -84,8 +84,15 @@ def __init__(
        **resource_args,
    ):
        super().__init__(**resource_args)
        assert len(paths) > 0
        if isinstance(paths, str):
            paths = [paths]
        assert isinstance(paths, list)
        self._paths = paths
        self._filesystem = filesystem
        if self._filesystem is None:
            self._try_infer_fs()

        assert parallelism is None, "Use override_num_blocks; remove parameter after 2025-03-01"
        self.override_num_blocks = override_num_blocks

@@ -95,6 +102,28 @@ def _is_s3_scheme(self) -> bool:
        else:
            return all(path.startswith("s3:") for path in self._paths)

    def _try_infer_fs(self):
        from sycamore.utils.pyarrow import infer_fs

        common_fs = None
        new_paths = []
        for p in self._paths:
            (fs, root) = infer_fs(p)
            new_paths.append(root)
            if common_fs is None:
                common_fs = fs
            if not isinstance(fs, common_fs.__class__):
                logger.warning(
                    f"Different paths infer multiple filesystems. {self._paths[0]}"
                    + f" gives {common_fs.__class__.__name__} and {p} gives"
                    + f" {fs.__class__.__name__}. Using no fs and hoping."
                )
                return

        assert common_fs is not None
        self._filesystem = common_fs
        self._paths = new_paths

    @abstractmethod
    def process_file(self, file_info: FileInfo) -> list[Document]:
        pass
@@ -155,7 +184,6 @@ def __init__(
            filesystem=filesystem,
            **resource_args,
        )
-        self._paths = paths
        self._binary_format = binary_format
        self._metadata_provider = metadata_provider
        self._filter_paths_by_extension = filter_paths_by_extension
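Net effect of the file_scan changes: `paths` may be a single string or a list, a missing filesystem is inferred from the paths, and the deprecated `parallelism` argument is rejected in favor of `override_num_blocks`. A minimal sketch of a read under the new behavior, reusing the public-bucket path from the notebooks in this commit (assuming it stays anonymously readable):

```python
import sycamore

ctx = sycamore.init()
# A bare string is normalized to a list internally, and the S3 filesystem is
# inferred, falling back to anonymous access if default credentials fail.
ds = ctx.read.binary("s3://aryn-public/cccmad", binary_format="pdf")
```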
1 change: 1 addition & 0 deletions lib/sycamore/sycamore/materialize.py
@@ -116,6 +116,7 @@ def _maybe_anonymous(self):
            fs.get_file_info(str(self._root))
            self._fs = fs
            self._fshelper = _PyArrowFsHelper(self._fs)
            logger.info(f"Successfully read path {self._root} with anonymous S3")
            return
        except OSError as e:
            logging.warning(
28 changes: 28 additions & 0 deletions lib/sycamore/sycamore/utils/pyarrow.py
@@ -22,9 +22,37 @@ def infer_fs(path: str) -> Tuple[FileSystem, str]:
    from pyarrow import fs

    (fs, root) = fs.FileSystem.from_uri(str(path))

    fs = maybe_use_anonymous_s3_fs(fs, root)

    return (fs, root)


def maybe_use_anonymous_s3_fs(fs: FileSystem, root: str) -> FileSystem:
    from pyarrow.fs import S3FileSystem

    if not isinstance(fs, S3FileSystem):
        return fs

    try:
        fs.get_file_info(root)
        return fs
    except OSError as e:
        logger.warning(f"Got error {e} trying to get file info on {root}, trying again in anonymous mode")

    new_fs = S3FileSystem(anonymous=True)
    try:
        new_fs.get_file_info(root)
        logger.info(f"Successfully read path {root} with anonymous S3")
        return new_fs
    except OSError as e:
        logger.warning(
            f"Got error {e} trying to get file info on {root} in anonymous mode. Using previous, non-working fs"
        )

    return fs


def cross_check_infer_fs(filesystem: Optional[FileSystem], path: str) -> Tuple[FileSystem, str]:
    if filesystem is None:
        return infer_fs(path)
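A small sketch of the fallback in isolation, assuming s3://aryn-public is publicly readable while the environment holds invalid AWS credentials:

```python
from sycamore.utils.pyarrow import infer_fs

# FileSystem.from_uri yields an S3FileSystem here; if get_file_info fails with
# the default credentials, infer_fs retries with S3FileSystem(anonymous=True)
# and returns the anonymous filesystem instead.
fs, root = infer_fs("s3://aryn-public/cccmad")
print(type(fs).__name__, root)
```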
File renamed without changes.
8 changes: 3 additions & 5 deletions notebooks/ndd_example.ipynb
@@ -146,8 +146,6 @@
"metadata": {},
"outputs": [],
"source": [
"parallelism = max(2, multiprocessing.cpu_count() // 2) if save_resources else -1\n",
"\n",
"tokenizer = HuggingFaceTokenizer('thenlper/gte-small')\n",
"embedder = SentenceTransformerEmbedder(model_name='sentence-transformers/all-MiniLM-L6-v2', batch_size=100)\n",
"\n",
@@ -157,12 +155,12 @@
"if use_json:\n",
" # Fast way: pre-processed DocSet as JSON...\n",
" path = 's3://aryn-public/cccmad-json'\n",
" ds = ctx.read.json_document(path, filesystem=fsys, parallelism=parallelism)\n",
" ds = ctx.read.json_document(path, filesystem=fsys)\n",
"else:\n",
" # Slow way: process PDF documents via Sycamore pipeline...\n",
" path = 's3://aryn-public/cccmad'\n",
" ds = (\n",
" ctx.read.binary(path, binary_format='pdf', filesystem=fsys, parallelism=parallelism)\n",
" ctx.read.binary(path, binary_format='pdf', filesystem=fsys)\n",
" .partition(partitioner=ArynPartitioner())\n",
" .regex_replace(COALESCE_WHITESPACE)\n",
" .mark_bbox_preset(tokenizer=tokenizer)\n",
@@ -342,7 +340,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
"version": "3.12.3"
}
},
"nbformat": 4,
112 changes: 112 additions & 0 deletions notebooks/run-notebook-tests.sh
@@ -0,0 +1,112 @@
#!/bin/bash
main() {
    config
    cd notebooks
    if [[ "$1" == --fast ]]; then
        echo "Testing fast notebooks..."
        local start=$(date +%s)
        test_notebooks "${FAST_NOTEBOOKS[@]}"
        local end=$(date +%s)
        local elapsed=$(expr $end - $start)
        echo "-------------------------------------------------------"
        echo "--- Testing fast notebooks took $elapsed/$FAST_MAXTIME seconds"
        if [[ $elapsed -ge $FAST_MAXTIME ]]; then
            echo "--- Fast tests took too long."
            echo "--- Move some fast tests to slow,"
            echo "--- or speed up the fast notebooks."
            exit 1
        fi
        echo "-------------------------------------------------------"
        check_coverage
    elif [[ "$1" == --slow ]]; then
        echo "Testing slow notebooks..."
        test_notebooks "${SLOW_NOTEBOOKS[@]}"
    else
        echo "Usage: $0 {--fast,--slow}"
        exit 1
    fi
    exit 0
}

config() {
    FAST_NOTEBOOKS=(
        default-prep-script.ipynb # 20+40s on Eric's laptop.
        OpenAI-logprob.ipynb # 5s on Eric's laptop.
    )

    FAST_MAXTIME=120
    SLOW_NOTEBOOKS=(
        ArynPartitionerExample.ipynb # Gets a crash message, but passes?
        ArynPartitionerPython.ipynb
        jupyter_dev_example.ipynb # O(n^2) from re-show/take
        metadata-extraction.ipynb # processes entire
        sycamore_demo.ipynb # O(n^2) from re-show/take
        tutorial.ipynb # aryn partitioner on entire sort benchmark
        VisualizePartitioner.ipynb
    )
    EXCLUDE_NOTEBOOKS=(
        # No good reason for exclusion, just what was
        # not automatically tested before
        ArynPartitionerWithLangchain.ipynb # needs langchain to be installed
        duckdb-writer.ipynb # timeout
        elasticsearch-writer.ipynb # needs elasticsearch db setup
        financial-docs-10k-example.ipynb # needs to be fixed like
        # tutorial to have assertion, assertion code needs to move to aryn_sdk
        ndd_example.ipynb # broken
        ntsb-demo.ipynb # needs rps
        opensearch_docs_etl.ipynb # timeout
        opensearch-writer.ipynb # depends on langchain
        pinecone-writer.ipynb # needs pinecone key
        query-demo.ipynb # depends on un-checked-in visualize.py
        subtask-sample.ipynb # broken -- old style llm prompts
        sycamore-tutorial-intermediate-etl.ipynb # timeout
        unpickle_query.ipynb # looking for uncommitted file
        weaviate-writer.ipynb # path not set to files
    )
}

test_notebooks() {
    for i in "$@"; do
        echo
        echo
        echo "-------------------------------------------------------------------------"
        echo "Starting test on $i as written"
        echo "-------------------------------------------------------------------------"

        time poetry run pytest --nbmake --nbmake-timeout=600 $i || exit 1

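        # If the notebook pins local execution, generate a Ray-mode variant by
        # rewriting sycamore.EXEC_LOCAL to sycamore.EXEC_RAY and test it too.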
        if [[ $(grep -c sycamore.EXEC_LOCAL $i) -ge 1 ]]; then
            sed -e 's/sycamore.EXEC_LOCAL/sycamore.EXEC_RAY/' <$i >ray-variant-$i
            echo "-------------------------------------------------------------------------"
            echo "Starting test on $i with EXEC_RAY"
            echo "-------------------------------------------------------------------------"
            time poetry run pytest --nbmake --nbmake-timeout=600 ray-variant-$i || exit 1
            rm ray-variant-$i
        fi
    done
}

check_coverage() {
    echo "Verifying coverage of all notebooks..."
    ls *.ipynb | grep -v '^ray-variant-' >/tmp/notebooks.list
    (
        for i in "${FAST_NOTEBOOKS[@]}" "${SLOW_NOTEBOOKS[@]}" "${EXCLUDE_NOTEBOOKS[@]}"; do
            echo "$i"
        done
    ) >>/tmp/notebooks.list
    sort /tmp/notebooks.list | uniq -c | sort -n | grep -v '^ 2 ' >/tmp/notebooks.unlisted
    if [[ $(wc -l </tmp/notebooks.unlisted) != 0 ]]; then
        echo "ERROR: some notebooks are unlisted."
        echo " Add them to FAST_NOTEBOOKS for fast tests that block commits,"
        echo " SLOW_NOTEBOOKS for slower tests that do not block commits,"
        echo " or EXCLUDE_NOTEBOOKS for notebooks that should not be automatically tested."
        echo " Missing notebooks:"
        sed 's/^ 1 //' /tmp/notebooks.unlisted
        exit 1
    fi
    rm /tmp/notebooks.list /tmp/notebooks.unlisted
    echo "All notebooks are classified as fast, slow or excluded."
}

main "$@"
exit 1
15 changes: 11 additions & 4 deletions notebooks/tutorial.ipynb
@@ -28,7 +28,7 @@
"context = sycamore.init()\n",
"\n",
"# Creating a DocSet\n",
"docset = context.read.binary(paths, parallelism=1, binary_format=\"pdf\")"
"docset = context.read.binary(paths, binary_format=\"pdf\")"
]
},
{
@@ -45,7 +45,9 @@
"cell_type": "markdown",
"metadata": {},
"source": [
-"if the above assertion fails, you can either set the environment variable ARYN_API_KEY and restart jupyter\n",
"If the above assertion fails, you need to configure your Aryn key. If you do not have one already, visit https://www.aryn.ai/cloud to get a key.\n",
"\n",
"To configure the key, you can either set the environment variable ARYN_API_KEY and restart jupyter\n",
"or make a yaml file at the specified path in the assertion error that looks like:\n",
"\n",
"```\n",
@@ -71,7 +73,12 @@
"\n",
"# We are using ArynPartitioner to partion the documents.\n",
"# Sycamore supports pluggable partitioners for different formats.\n",
"docset = docset.partition(partitioner=ArynPartitioner())"
"docset = docset.partition(partitioner=ArynPartitioner())\n",
"\n",
"# Now save the docset after partitioning so we can re-use computations.\n",
"docset = docset.materialize(\"tmp/tutorial/post-partition\", source_mode=sycamore.MATERIALIZE_USE_STORED)\n",
"# Run the docset to complete the partitioning.\n",
"docset.execute()\n"
]
},
{
@@ -229,7 +236,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
"version": "3.12.3"
}
},
"nbformat": 4,