Fixup notebook tests (#933)
* Add a script to run notebook tests that will
  * optionally run in both local and Ray mode
  * split between fast and slow tests
  * verify all notebooks are accounted for
* Fix file scans so they can run with invalid AWS credentials in the environment, using the
  same trick as materialize: try anonymous access if the default credentials don't work.
* Fix the ndd_example notebook to not specify parallelism.
* Fix the tutorial notebook to use materialize to avoid some re-execution.
* Add some logging.
* Lock in the new 0.1.24 sycamore release.
* Ignore notebooks/tmp.
eric-anderson authored Oct 18, 2024
1 parent f6c47fd commit f0a358c
Showing 10 changed files with 265 additions and 14 deletions.
78 changes: 77 additions & 1 deletion .github/workflows/testing.yml
@@ -105,6 +105,45 @@ jobs:
        run: poetry run pytest remote_processors/test/unit/
        working-directory: lib/remote-processors

  notebook-tests-fast:
    runs-on: integ-test-runner2
    strategy:
      matrix:
        python-version: ["3.9"]
    services:
      opensearch:
        image: opensearchproject/opensearch:2.10.0
        env:
          discovery.type: "single-node"
        ports:
          - 9200:9200

    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Install poetry
        run: pipx install poetry
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
          cache: "poetry"
      - name: Install sycamore
        run: poetry install --all-extras
      - name: Download nltk packages
        run: poetry run python -m nltk.downloader punkt averaged_perceptron_tagger
      - name: Update Apt
        run: sudo apt-get update
      - name: Install poppler and tesseract
        run: sudo apt-get install -y poppler-utils tesseract-ocr
      - name: Configure AWS Credentials via OIDC provider.
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-region: us-east-1
          role-to-assume: arn:aws:iam::237550789389:role/aryn-github-integ
      - name: Run Notebook tests
        run: ./notebooks/run-notebook-tests.sh --fast

  integ-tests:
    runs-on: integ-test-runner2
    strategy:
@@ -175,5 +214,42 @@ jobs:
        run: sudo chmod -R 777 /neo4j/import
      - name: Run Integ tests
        run: poetry run pytest lib/sycamore/sycamore/tests/integration

  notebook-tests-slow:
    runs-on: integ-test-runner2
    strategy:
      matrix:
        python-version: ["3.9"]
    services:
      opensearch:
        image: opensearchproject/opensearch:2.10.0
        env:
          discovery.type: "single-node"
        ports:
          - 9200:9200

    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Install poetry
        run: pipx install poetry
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
          cache: "poetry"
      - name: Install sycamore
        run: poetry install --all-extras
      - name: Download nltk packages
        run: poetry run python -m nltk.downloader punkt averaged_perceptron_tagger
      - name: Update Apt
        run: sudo apt-get update
      - name: Install poppler and tesseract
        run: sudo apt-get install -y poppler-utils tesseract-ocr
      - name: Configure AWS Credentials via OIDC provider.
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-region: us-east-1
          role-to-assume: arn:aws:iam::237550789389:role/aryn-github-integ
      - name: Run Notebook tests
-        run: poetry run pytest --nbmake --nbmake-timeout=600 notebooks/default-prep-script.ipynb notebooks/jupyter_dev_example.ipynb notebooks/metadata-extraction.ipynb notebooks/sycamore_demo.ipynb notebooks/tutorial.ipynb
        run: ./notebooks/run-notebook-tests.sh --slow
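Both jobs now delegate notebook selection to notebooks/run-notebook-tests.sh (added below), which wraps pytest's nbmake plugin. As a rough sketch, the same check can be driven from Python; this assumes pytest and nbmake are installed in the poetry environment, and the notebook path is only illustrative:

```python
# Mirrors the CI invocation: poetry run pytest --nbmake --nbmake-timeout=600 <notebook>
import pytest

exit_code = pytest.main(["--nbmake", "--nbmake-timeout=600", "notebooks/tutorial.ipynb"])
raise SystemExit(int(exit_code))
```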
1 change: 1 addition & 0 deletions .gitignore
@@ -47,3 +47,4 @@ notebooks/default-prep-data
.python-version
apps/timetrace/ttviz
notebooks/pc-tutorial
notebooks/tmp
30 changes: 29 additions & 1 deletion lib/sycamore/sycamore/connectors/file/file_scan.py
@@ -84,8 +84,15 @@ def __init__(
        **resource_args,
    ):
        super().__init__(**resource_args)
        assert len(paths) > 0
        if isinstance(paths, str):
            paths = [paths]
        assert isinstance(paths, list)
        self._paths = paths
        self._filesystem = filesystem
        if self._filesystem is None:
            self._try_infer_fs()

        assert parallelism is None, "Use override_num_blocks; remove parameter after 2025-03-01"
        self.override_num_blocks = override_num_blocks

@@ -95,6 +102,28 @@ def _is_s3_scheme(self) -> bool:
        else:
            return all(path.startswith("s3:") for path in self._paths)

    def _try_infer_fs(self):
        from sycamore.utils.pyarrow import infer_fs

        common_fs = None
        new_paths = []
        for p in self._paths:
            (fs, root) = infer_fs(p)
            new_paths.append(root)
            if common_fs is None:
                common_fs = fs
            if not isinstance(fs, common_fs.__class__):
                logger.warning(
                    f"Different paths infer multiple filesystems. {self._paths[0]}"
                    + f" gives {common_fs.__class__.__name__} and {p} gives"
                    + f" {fs.__class__.__name__}. Using no fs and hoping."
                )
                return

        assert common_fs is not None
        self._filesystem = common_fs
        self._paths = new_paths

    @abstractmethod
    def process_file(self, file_info: FileInfo) -> list[Document]:
        pass
@@ -155,7 +184,6 @@ def __init__(
            filesystem=filesystem,
            **resource_args,
        )
-        self._paths = paths
        self._binary_format = binary_format
        self._metadata_provider = metadata_provider
        self._filter_paths_by_extension = filter_paths_by_extension
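Net effect of the file_scan changes: `paths` may be a single string or a list, a missing filesystem is inferred from the paths, and the deprecated `parallelism` argument is rejected in favor of `override_num_blocks`. A minimal sketch of a read under the new behavior, reusing the public-bucket path from the notebooks in this commit (assuming it stays anonymously readable):

```python
import sycamore

ctx = sycamore.init()
# A bare string is normalized to a list internally, and the S3 filesystem is
# inferred, falling back to anonymous access if default credentials fail.
ds = ctx.read.binary("s3://aryn-public/cccmad", binary_format="pdf")
```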
1 change: 1 addition & 0 deletions lib/sycamore/sycamore/materialize.py
@@ -116,6 +116,7 @@ def _maybe_anonymous(self):
            fs.get_file_info(str(self._root))
            self._fs = fs
            self._fshelper = _PyArrowFsHelper(self._fs)
            logger.info(f"Successfully read path {self._root} with anonymous S3")
            return
        except OSError as e:
            logging.warning(
28 changes: 28 additions & 0 deletions lib/sycamore/sycamore/utils/pyarrow.py
@@ -22,9 +22,37 @@ def infer_fs(path: str) -> Tuple[FileSystem, str]:
    from pyarrow import fs

    (fs, root) = fs.FileSystem.from_uri(str(path))

    fs = maybe_use_anonymous_s3_fs(fs, root)

    return (fs, root)


def maybe_use_anonymous_s3_fs(fs: FileSystem, root: str) -> FileSystem:
    from pyarrow.fs import S3FileSystem

    if not isinstance(fs, S3FileSystem):
        return fs

    try:
        fs.get_file_info(root)
        return fs
    except OSError as e:
        logger.warning(f"Got error {e} trying to get file info on {root}, trying again in anonymous mode")

    new_fs = S3FileSystem(anonymous=True)
    try:
        new_fs.get_file_info(root)
        logger.info(f"Successfully read path {root} with anonymous S3")
        return new_fs
    except OSError as e:
        logger.warning(
            f"Got error {e} trying to get file info on {root} in anonymous mode. Using previous, non-working fs"
        )

    return fs


def cross_check_infer_fs(filesystem: Optional[FileSystem], path: str) -> Tuple[FileSystem, str]:
    if filesystem is None:
        return infer_fs(path)
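A small sketch of the fallback in isolation, assuming s3://aryn-public is publicly readable while the environment holds invalid AWS credentials:

```python
from sycamore.utils.pyarrow import infer_fs

# FileSystem.from_uri yields an S3FileSystem here; if get_file_info fails with
# the default credentials, infer_fs retries with S3FileSystem(anonymous=True)
# and returns the anonymous filesystem instead.
fs, root = infer_fs("s3://aryn-public/cccmad")
print(type(fs).__name__, root)
```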
File renamed without changes.
8 changes: 3 additions & 5 deletions notebooks/ndd_example.ipynb
@@ -146,8 +146,6 @@
"metadata": {},
"outputs": [],
"source": [
"parallelism = max(2, multiprocessing.cpu_count() // 2) if save_resources else -1\n",
"\n",
"tokenizer = HuggingFaceTokenizer('thenlper/gte-small')\n",
"embedder = SentenceTransformerEmbedder(model_name='sentence-transformers/all-MiniLM-L6-v2', batch_size=100)\n",
"\n",
@@ -157,12 +155,12 @@
"if use_json:\n",
" # Fast way: pre-processed DocSet as JSON...\n",
" path = 's3://aryn-public/cccmad-json'\n",
" ds = ctx.read.json_document(path, filesystem=fsys, parallelism=parallelism)\n",
" ds = ctx.read.json_document(path, filesystem=fsys)\n",
"else:\n",
" # Slow way: process PDF documents via Sycamore pipeline...\n",
" path = 's3://aryn-public/cccmad'\n",
" ds = (\n",
" ctx.read.binary(path, binary_format='pdf', filesystem=fsys, parallelism=parallelism)\n",
" ctx.read.binary(path, binary_format='pdf', filesystem=fsys)\n",
" .partition(partitioner=ArynPartitioner())\n",
" .regex_replace(COALESCE_WHITESPACE)\n",
" .mark_bbox_preset(tokenizer=tokenizer)\n",
@@ -342,7 +340,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
"version": "3.12.3"
}
},
"nbformat": 4,
112 changes: 112 additions & 0 deletions notebooks/run-notebook-tests.sh
@@ -0,0 +1,112 @@
#!/bin/bash
main() {
    config
    cd notebooks
    if [[ "$1" == --fast ]]; then
        echo "Testing fast notebooks..."
        local start=$(date +%s)
        test_notebooks "${FAST_NOTEBOOKS[@]}"
        local end=$(date +%s)
        local elapsed=$(expr $end - $start)
        echo "-------------------------------------------------------"
        echo "--- Testing fast notebooks took $elapsed/$FAST_MAXTIME seconds"
        if [[ $elapsed -ge $FAST_MAXTIME ]]; then
            echo "--- Fast tests took too long."
            echo "--- Move some fast tests to slow,"
            echo "--- or speed up the fast notebooks."
            exit 1
        fi
        echo "-------------------------------------------------------"
        check_coverage
    elif [[ "$1" == --slow ]]; then
        echo "Testing slow notebooks..."
        test_notebooks "${SLOW_NOTEBOOKS[@]}"
    else
        echo "Usage: $0 {--fast,--slow}"
        exit 1
    fi
    exit 0
}

config() {
    FAST_NOTEBOOKS=(
        default-prep-script.ipynb # 20+40s on Eric's laptop.
        OpenAI-logprob.ipynb # 5s on Eric's laptop.
    )

    FAST_MAXTIME=120
    SLOW_NOTEBOOKS=(
        ArynPartitionerExample.ipynb # Gets a crash message, but passes?
        ArynPartitionerPython.ipynb
        jupyter_dev_example.ipynb # O(n^2) from re-show/take
        metadata-extraction.ipynb # processes entire
        sycamore_demo.ipynb # O(n^2) from re-show/take
        tutorial.ipynb # aryn partitioner on entire sort benchmark
        VisualizePartitioner.ipynb
    )
    EXCLUDE_NOTEBOOKS=(
        # No good reason for exclusion, just what was
        # not automatically tested before
        ArynPartitionerWithLangchain.ipynb # needs langchain to be installed
        duckdb-writer.ipynb # timeout
        elasticsearch-writer.ipynb # needs elasticsearch db setup
        financial-docs-10k-example.ipynb # needs to be fixed like
        # tutorial to have assertion, assertion code needs to move to aryn_sdk
        ndd_example.ipynb # broken
        ntsb-demo.ipynb # needs rps
        opensearch_docs_etl.ipynb # timeout
        opensearch-writer.ipynb # depends on langchain
        pinecone-writer.ipynb # needs pinecone key
        query-demo.ipynb # depends on un-checked-in visualize.py
        subtask-sample.ipynb # broken -- old style llm prompts
        sycamore-tutorial-intermediate-etl.ipynb # timeout
        unpickle_query.ipynb # looking for uncommitted file
        weaviate-writer.ipynb # path not set to files
    )
}

test_notebooks() {
    for i in "$@"; do
        echo
        echo
        echo "-------------------------------------------------------------------------"
        echo "Starting test on $i as written"
        echo "-------------------------------------------------------------------------"

        time poetry run pytest --nbmake --nbmake-timeout=600 $i || exit 1

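        # If the notebook pins local execution, generate a Ray-mode variant by
        # rewriting sycamore.EXEC_LOCAL to sycamore.EXEC_RAY and test it too.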
        if [[ $(grep -c sycamore.EXEC_LOCAL $i) -ge 1 ]]; then
            sed -e 's/sycamore.EXEC_LOCAL/sycamore.EXEC_RAY/' <$i >ray-variant-$i
            echo "-------------------------------------------------------------------------"
            echo "Starting test on $i with EXEC_RAY"
            echo "-------------------------------------------------------------------------"
            time poetry run pytest --nbmake --nbmake-timeout=600 ray-variant-$i || exit 1
            rm ray-variant-$i
        fi
    done
}

check_coverage() {
    echo "Verifying coverage of all notebooks..."
    ls *.ipynb | grep -v '^ray-variant-' >/tmp/notebooks.list
    (
        for i in "${FAST_NOTEBOOKS[@]}" "${SLOW_NOTEBOOKS[@]}" "${EXCLUDE_NOTEBOOKS[@]}"; do
            echo "$i"
        done
    ) >>/tmp/notebooks.list
    sort /tmp/notebooks.list | uniq -c | sort -n | grep -v '^ 2 ' >/tmp/notebooks.unlisted
    if [[ $(wc -l </tmp/notebooks.unlisted) != 0 ]]; then
        echo "ERROR: some notebooks are unlisted."
        echo " Add them to FAST_NOTEBOOKS for fast tests that block commits,"
        echo " SLOW_NOTEBOOKS for slower tests that do not block commits,"
        echo " or EXCLUDE_NOTEBOOKS for notebooks that should not be automatically tested."
        echo " Missing notebooks:"
        sed 's/^ 1 //' /tmp/notebooks.unlisted
        exit 1
    fi
    rm /tmp/notebooks.list /tmp/notebooks.unlisted
    echo "All notebooks are classified as fast, slow or excluded."
}

main "$@"
exit 1
15 changes: 11 additions & 4 deletions notebooks/tutorial.ipynb
@@ -28,7 +28,7 @@
"context = sycamore.init()\n",
"\n",
"# Creating a DocSet\n",
"docset = context.read.binary(paths, parallelism=1, binary_format=\"pdf\")"
"docset = context.read.binary(paths, binary_format=\"pdf\")"
]
},
{
@@ -45,7 +45,9 @@
"cell_type": "markdown",
"metadata": {},
"source": [
-"if the above assertion fails, you can either set the environment variable ARYN_API_KEY and restart jupyter\n",
"If the above assertion fails, you need to configure your Aryn key. If you do not have one already, visit https://www.aryn.ai/cloud to get a key.\n",
"\n",
"To configure the key, you can either set the environment variable ARYN_API_KEY and restart jupyter\n",
"or make a yaml file at the specified path in the assertion error that looks like:\n",
"\n",
"```\n",
@@ -71,7 +73,12 @@
"\n",
"# We are using ArynPartitioner to partion the documents.\n",
"# Sycamore supports pluggable partitioners for different formats.\n",
"docset = docset.partition(partitioner=ArynPartitioner())"
"docset = docset.partition(partitioner=ArynPartitioner())\n",
"\n",
"# Now save the docset after partitioning so we can re-use computations.\n",
"docset = docset.materialize(\"tmp/tutorial/post-partition\", source_mode=sycamore.MATERIALIZE_USE_STORED)\n",
"# Run the docset to complete the partitioning.\n",
"docset.execute()\n"
]
},
{
@@ -229,7 +236,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
"version": "3.12.3"
}
},
"nbformat": 4,