From 4f52aaea1b0bc4fd93535c7cffb5c9ef3bfb9065 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Xuebin=20Su=20=28=E8=8B=8F=E5=AD=A6=E6=96=8C=29?= <12034000+xuebinsu@users.noreply.github.com> Date: Thu, 23 Nov 2023 15:34:02 +0800 Subject: [PATCH] Support more index types besides ivfflat (#224) Previously, we only support indexing embeddings using the `ivfflat` access method in [pgvector](https://github.com/pgvector/pgvector). Recently, a new access method `hnsw` has been added to pgvector. `hnsw` is believed to be more performant and accurate than `ivfflat`. To allow for more flexibility, we add a new parameter `method` to allow user to choose which access method to use when creating index. Also, a new parameter `embedding_dimension` is added to support more models, since dimension is required for pgvector to create index. A new test case for embeddings is added in `tests/test_embedding.py`. To support `set allow_system_table_mods = on;`, Postgres is upgraded from 12 to 13 on CI. --- .github/workflows/test.yml | 18 +++++-- concourse/test.sh | 54 ------------------- doc/source/notebooks/embedding.ipynb | 3 +- greenplumpython/dataframe.py | 2 +- greenplumpython/experimental/embedding.py | 40 +++++++++----- server/build.sh | 16 +++++- server/initdb.sh | 2 +- ...erfile => postgres13-python311.Dockerfile} | 4 +- ...kerfile => postgres13-python39.Dockerfile} | 4 +- server/requirements.sh | 6 +++ tests/__init__.py | 6 +-- tests/conftest.py | 26 +++++++++ tests/test_embedding.py | 26 +++++++++ tests/test_use_pickler.py | 1 + tox.ini | 8 +++ 15 files changed, 131 insertions(+), 85 deletions(-) delete mode 100755 concourse/test.sh rename server/{postgres12-python311.Dockerfile => postgres13-python311.Dockerfile} (77%) rename server/{postgres12-python39.Dockerfile => postgres13-python39.Dockerfile} (77%) create mode 100644 server/requirements.sh create mode 100644 tests/test_embedding.py diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 41bca741..fd621814 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -16,11 +16,11 @@ jobs: fail-fast: true matrix: python-version: ["3.9", "3.11"] - server: ["postgres12-python39", "postgres12-python311"] + server: ["postgres13-python39", "postgres13-python311"] include: - - server: "postgres12-python39" + - server: "postgres13-python39" server-python-version: "3.9" - - server: "postgres12-python311" + - server: "postgres13-python311" server-python-version: "3.11" steps: @@ -30,11 +30,18 @@ jobs: with: python-version: ${{ matrix.python-version }} cache: pip + - name: Build server image + run: | + docker build \ + -t greenplumpython-server:${{ matrix.server }} \ + -f server/${{ matrix.server }}.Dockerfile \ + server/ - name: Run tests without pickler run: | python3 -m pip install tox~=4.11 tox-docker~=4.1 && \ tox \ - --override=docker:server.dockerfile=server/${{ matrix.server }}.Dockerfile \ + --override=docker:server.image=greenplumpython-server:${{ matrix.server }} \ + --override=docker:server.dockerfile='' \ -e test-container \ -- \ --override-ini=server_use_pickler=false \ @@ -43,7 +50,8 @@ jobs: if: ${{ matrix.python-version == matrix.server-python-version }} run: | tox \ - --override=docker:server.dockerfile=server/${{ matrix.server }}.Dockerfile \ + --override=docker:server.image=greenplumpython-server:${{ matrix.server }} \ + --override=docker:server.dockerfile='' \ -e test-container \ -- \ --override-ini=server_use_pickler=true \ diff --git a/concourse/test.sh b/concourse/test.sh deleted file mode 100755 index cdc39d62..00000000 --- a/concourse/test.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/bash -l - -set -o xtrace -o errexit -o nounset -o pipefail - -install_plpython3() { - if [[ $GP_MAJOR_VERSION == 6 && ! -f "$(pg_config --pkglibdir)/plpython3.so" ]]; then - mkdir -p bin_plpython3/install_tmp - pushd bin_plpython3/install_tmp - find .. -maxdepth 1 -regex ".*-[0-9\.]*-.*\.tar\.gz" -exec tar xfv {} \; - ./install_gpdb_component - popd - fi -} - -start_gpdb_as_gpadmin() { - if ! gpstate; then - source "$CI_REPO_DIR/common/entry_common.sh" - sudo passwd --delete gpadmin # for `su gpadmin` in start_gpdb - if [[ "$(source /etc/os-release && echo "$ID")" == "ubuntu" ]]; then - su () { - sudo "su" "$@" - } - fi - set +o nounset - start_gpdb - source "$HOME/.bashrc" # for gpdemo-env.sh - set -o nounset - fi -} - -setup_testdb() { - local testdb=$1 - # FIXME: The test db and extension creation should be handled by python code. - if ! psql "$testdb" -c ''; then - createdb "$testdb" - fi - psql "$testdb" -c "CREATE EXTENSION IF NOT EXISTS plpython3u;" -} - -_main() { - install_plpython3 - start_gpdb_as_gpadmin - setup_testdb gpadmin - - # Run testing - pushd /home/gpadmin/greenplumpython_src - unset PYTHONPATH - unset PYTHONHOME - python3.9 -m pip install tox - tox -e test - popd -} - -_main diff --git a/doc/source/notebooks/embedding.ipynb b/doc/source/notebooks/embedding.ipynb index 368ebd72..f8b0cdb6 100644 --- a/doc/source/notebooks/embedding.ipynb +++ b/doc/source/notebooks/embedding.ipynb @@ -73,6 +73,7 @@ " distribution_key={\"id\"},\n", " distribution_type=\"hash\",\n", " drop_if_exists=True,\n", + " drop_cascade=True,\n", " )\n", " .check_unique(columns={\"id\"})\n", ")" @@ -128,7 +129,7 @@ "source": [ "import greenplumpython.experimental.embedding\n", "\n", - "t = t.embedding().create_index(column=\"content\", model=\"all-MiniLM-L6-v2\")\n", + "t = t.embedding().create_index(column=\"content\", model_name=\"all-MiniLM-L6-v2\")\n", "t" ] }, diff --git a/greenplumpython/dataframe.py b/greenplumpython/dataframe.py index bad6757c..eba3772f 100644 --- a/greenplumpython/dataframe.py +++ b/greenplumpython/dataframe.py @@ -963,7 +963,7 @@ def save_as( if distribution_type == "replicated" else "RANDOMLY"} """ - if distribution_type is not None + if self._db._is_variant("greenplum") and distribution_type is not None else "" ) if drop_cascade: diff --git a/greenplumpython/experimental/embedding.py b/greenplumpython/experimental/embedding.py index 0bf9247f..a319e5d2 100644 --- a/greenplumpython/experimental/embedding.py +++ b/greenplumpython/experimental/embedding.py @@ -1,4 +1,4 @@ -from typing import Any, Callable, cast +from typing import Any, Callable, Literal, Optional, cast from uuid import uuid4 import greenplumpython as gp @@ -75,7 +75,13 @@ class Embedding: def __init__(self, dataframe: gp.DataFrame) -> None: self._dataframe = dataframe - def create_index(self, column: str, model_name: str) -> gp.DataFrame: + def create_index( + self, + column: str, + model_name: str, + embedding_dimension: Optional[int] = None, + method: Optional[Literal["ivfflat", "hnsw"]] = "hnsw", + ) -> gp.DataFrame: """ Generate embeddings and create index for a column of unstructured data. @@ -96,6 +102,8 @@ def create_index(self, column: str, model_name: str) -> gp.DataFrame: Args: column: name of column to create index on. model_name: name of model to generate embedding. + embedding_dimension: dimension of the embedding. + method: name of the index access method (i.e. index type) in `pgvector `_. Returns: Dataframe with target column indexed based on embeddings. @@ -105,17 +113,17 @@ def create_index(self, column: str, model_name: str) -> gp.DataFrame: """ - import sentence_transformers # type: ignore reportMissingImports - - model = sentence_transformers.SentenceTransformer(model_name) # type: ignore reportUnknownVariableType - assert self._dataframe.unique_key is not None, "Unique key is required to create index." - try: - word_embedding_dimension: int = model[1].word_embedding_dimension # From models.Pooling - except: - raise NotImplementedError( - "Model '{model_name}' doesn't provide embedding dimension information" - ) + if embedding_dimension is None: + try: + import sentence_transformers # type: ignore reportMissingImports + + model = sentence_transformers.SentenceTransformer(model_name) # type: ignore reportUnknownVariableType + embedding_dimension: int = model[1].word_embedding_dimension # From models.Pooling + except: + raise NotImplementedError( + "Model '{model_name}' doesn't provide embedding dimension information" + ) embedding_col_name = "_emb_" + uuid4().hex embedding_df_cols = list(self._dataframe.unique_key) + [embedding_col_name] @@ -126,7 +134,7 @@ def create_index(self, column: str, model_name: str) -> gp.DataFrame: Callable[[gp.DataFrame], TypeCast], # FIXME: Modifier must be adapted to all types of model. # Can this be done with transformers.AutoConfig? - lambda t: gp.type_("vector", modifier=word_embedding_dimension)(_generate_embedding(t[column], model_name)), # type: ignore reportUnknownLambdaType + lambda t: gp.type_("vector", modifier=embedding_dimension)(_generate_embedding(t[column], model_name)), # type: ignore reportUnknownLambdaType ) }, )[embedding_df_cols] @@ -136,8 +144,12 @@ def create_index(self, column: str, model_name: str) -> gp.DataFrame: distribution_type="hash", ) .check_unique(self._dataframe.unique_key) - .create_index(columns={embedding_col_name}, method="ivfflat") ) + if method is not None: + assert method in ["ivfflat", "hnsw"] + embedding_df = embedding_df.create_index( + columns={embedding_col_name: "vector_l2_ops"}, method=method + ) assert self._dataframe._db is not None _record_dependency._create_in_db(self._dataframe._db) sql_add_relationship = f""" diff --git a/server/build.sh b/server/build.sh index 573d968f..4c71006e 100644 --- a/server/build.sh +++ b/server/build.sh @@ -12,10 +12,22 @@ apt-get install --no-install-recommends -y \ python3-venv apt-get autoclean -POSTGRES_USER_SITE=$(su --login postgres --session-command "python3 -m site --user-site") -POSTGRES_USER_BASE=$(su --login postgres --session-command "python3 -m site --user-base") +POSTGRES_USER_SITE=$(su postgres --session-command "python3 -m site --user-site") +POSTGRES_USER_BASE=$(su postgres --session-command "python3 -m site --user-base") mkdir --parents "$POSTGRES_USER_SITE" chown --recursive postgres "$POSTGRES_USER_BASE" cp /tmp/initdb.sh /docker-entrypoint-initdb.d chown postgres /docker-entrypoint-initdb.d/* + +setup_venv() { + python3 -m venv "$HOME"/venv + # shellcheck source=/dev/null + source "$HOME"/venv/bin/activate + + # shellcheck source=/dev/null + source /tmp/requirements.sh +} + +export -f setup_venv +su postgres --session-command 'bash -c setup_venv' diff --git a/server/initdb.sh b/server/initdb.sh index 0400aeff..b1e60053 100644 --- a/server/initdb.sh +++ b/server/initdb.sh @@ -8,5 +8,5 @@ set -o nounset -o xtrace -o errexit -o pipefail echo "log_destination = 'csvlog'" } >>"$PGDATA"/postgresql.conf -python3 -m venv "$HOME"/venv +# shellcheck source=/dev/null source "$HOME"/venv/bin/activate diff --git a/server/postgres12-python311.Dockerfile b/server/postgres13-python311.Dockerfile similarity index 77% rename from server/postgres12-python311.Dockerfile rename to server/postgres13-python311.Dockerfile index 58962428..ad81c6d4 100644 --- a/server/postgres12-python311.Dockerfile +++ b/server/postgres13-python311.Dockerfile @@ -1,6 +1,6 @@ -FROM postgres:12-bookworm +FROM postgres:13-bookworm -COPY build.sh initdb.sh /tmp/ +COPY build.sh initdb.sh requirements.sh /tmp/ RUN bash /tmp/build.sh HEALTHCHECK --interval=1s --timeout=1s --start-period=1s --retries=30 CMD psql \ diff --git a/server/postgres12-python39.Dockerfile b/server/postgres13-python39.Dockerfile similarity index 77% rename from server/postgres12-python39.Dockerfile rename to server/postgres13-python39.Dockerfile index 39de51f7..fc7a6e91 100644 --- a/server/postgres12-python39.Dockerfile +++ b/server/postgres13-python39.Dockerfile @@ -1,6 +1,6 @@ -FROM postgres:12-bullseye +FROM postgres:13-bullseye -COPY build.sh initdb.sh /tmp/ +COPY build.sh initdb.sh requirements.sh /tmp/ RUN bash /tmp/build.sh HEALTHCHECK --interval=1s --timeout=1s --start-period=1s --retries=30 CMD psql \ diff --git a/server/requirements.sh b/server/requirements.sh new file mode 100644 index 00000000..a73289a0 --- /dev/null +++ b/server/requirements.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +set -o errexit -o nounset -o pipefail -o xtrace + +python3 -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu +python3 -m pip install sentence-transformers diff --git a/tests/__init__.py b/tests/__init__.py index fb496f01..13b00b0a 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -28,7 +28,7 @@ def pip_install(requirements: str) -> str: @pytest.fixture(scope="session") -def db(server_use_pickler: bool): +def db(server_use_pickler: bool, server_has_pgvector: bool): # for the connection both work for GitHub Actions and concourse db = gp.database( params={ @@ -41,10 +41,10 @@ def db(server_use_pickler: bool): db._execute( """ CREATE EXTENSION IF NOT EXISTS plpython3u; - CREATE EXTENSION IF NOT EXISTS vector; DROP SCHEMA IF EXISTS test CASCADE; CREATE SCHEMA test; - """, + """ + + ("CREATE EXTENSION IF NOT EXISTS vector;" if server_has_pgvector else ""), has_results=False, ) if server_use_pickler: diff --git a/tests/conftest.py b/tests/conftest.py index 67ac45cb..0d945518 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -8,9 +8,35 @@ def pytest_addoption(parser: pytest.Parser): default=True, help="Use pickler to deserialize UDFs on server.", ) + parser.addini( + "server_has_pgvector", + type="bool", + default=True, + help="pgvector is available on server.", + ) @pytest.fixture(scope="session") def server_use_pickler(pytestconfig: pytest.Config) -> bool: val: bool = pytestconfig.getini("server_use_pickler") return val + + +@pytest.fixture(scope="session") +def server_has_pgvector(pytestconfig: pytest.Config) -> bool: + val: bool = pytestconfig.getini("server_has_pgvector") + return val + + +def pytest_collection_modifyitems(config: pytest.Config, items: list[pytest.Item]): + server_has_pgvector: bool = config.getini("server_has_pgvector") + server_use_pickler: bool = config.getini("server_use_pickler") + xfail_requires_pgvector = pytest.mark.xfail(reason="requires pgvector on server to run") + xfail_requires_pickler_on_server = pytest.mark.xfail( + reason="requires pickler (e.g. dill) on server to run" + ) + for item in items: + if "requires_pgvector" in item.keywords and not server_has_pgvector: + item.add_marker(xfail_requires_pgvector) + if "requires_pickler_on_server" in item.keywords and not server_use_pickler: + item.add_marker(xfail_requires_pickler_on_server) diff --git a/tests/test_embedding.py b/tests/test_embedding.py new file mode 100644 index 00000000..99f397e3 --- /dev/null +++ b/tests/test_embedding.py @@ -0,0 +1,26 @@ +import pytest + +import greenplumpython as gp +from tests import db + + +@pytest.mark.requires_pgvector +def test_embedding_query_string(db: gp.Database): + content = ["I have a dog.", "I like eating apples."] + t = ( + db.create_dataframe(columns={"id": range(len(content)), "content": content}) + .save_as( + temp=True, + column_names=["id", "content"], + distribution_key={"id"}, + distribution_type="hash", + drop_if_exists=True, + drop_cascade=True, + ) + .check_unique(columns={"id"}) + ) + t = t.embedding().create_index(column="content", model_name="all-MiniLM-L6-v2") + df = t.embedding().search(column="content", query="apple", top_k=1) + assert len(list(df)) == 1 + for row in df: + assert row["content"] == "I like eating apples." diff --git a/tests/test_use_pickler.py b/tests/test_use_pickler.py index 06297804..4d9b9dc0 100644 --- a/tests/test_use_pickler.py +++ b/tests/test_use_pickler.py @@ -11,6 +11,7 @@ def test_pickler_option(server_use_pickler: bool): from dataclasses import dataclass +@pytest.mark.requires_pickler_on_server def test_pickler_outside_class(db: gp.Database): @dataclass class Int: diff --git a/tox.ini b/tox.ini index 711f73c9..015b4eb4 100644 --- a/tox.ini +++ b/tox.ini @@ -17,6 +17,7 @@ host = localhost port = 65432 [docker:server] +image = dockerfile = server/postgres12-python39.Dockerfile environment = POSTGRES_USER={[server]user} @@ -28,7 +29,9 @@ ports = [testenv:test-container] docker = server deps = -r requirements-dev.txt +allowlist_externals = bash commands = + bash -c 'source .tox/test-container/bin/activate && source server/requirements.sh' pytest --exitfirst {posargs} setenv = PGHOST={[server]host} @@ -38,7 +41,9 @@ setenv = [testenv:test] deps = -r requirements-dev.txt +allowlist_externals = bash commands = + bash -c 'source .tox/test/bin/activate && source server/requirements.sh' pytest --exitfirst {posargs} setenv = PGHOST={env:PGHOST:localhost} @@ -74,6 +79,9 @@ doctest_optionflags = NORMALIZE_WHITESPACE testpaths = tests greenplumpython +markers = + requires_pickler_on_server + requires_pgvector [pydocstyle] # TODO: Enable docstyle check for all files