[8.15] Add recall and NDCG operations in msmarco-v2-vector #710

Merged
merged 2 commits
Dec 16, 2024
Changes from all commits
2 changes: 1 addition & 1 deletion cohere_vector/index-vectors-only-mapping.json
@@ -12,7 +12,7 @@
"dynamic": false,
{%- if build_flavor != "serverless" -%}
"_source": {
"enabled": false
"mode": "synthetic"
},
{%- endif -%}
"properties": {
2 changes: 1 addition & 1 deletion it/test_all_tracks_and_challenges.py
@@ -21,7 +21,7 @@


class TestTrackRepository:
skip_tracks = ["elastic/logs", "elastic/security", "k8s_metrics", "sql", "elser-ingest-speedtest"]
skip_tracks = ["elastic/logs", "elastic/security", "k8s_metrics", "sql", "elser-ingest-speedtest", "msmarco-v2-vector"]
disable_assertions = {
"http_logs": ["append-no-conflicts", "runtime-fields"],
"nyc_taxis": ["update-aggs-only"],
3 changes: 2 additions & 1 deletion it_serverless/test_selected_tracks_and_challenges.py
@@ -35,13 +35,14 @@ class TestTrackRepository:
"github_archive",
"http_logs",
# "k8s_metrics", (slow)
# "msmarco-v2-vector", (slow for test mode)
"nested",
"noaa",
"nyc_taxis",
"percolator",
"pmc",
"so",
"so_vector",
# "so_vector", (excludes in _source)
# "sql", (no support for test mode)
"tsdb",
"tsdb_k8s_queries",
23 changes: 17 additions & 6 deletions msmarco-v2-vector/README.md
@@ -1,9 +1,8 @@
## msmarco-v2 vector track

This track benchmarks the dataset from [Cohere/msmarco-v2-embed-english-v3](https://huggingface.co/datasets/Cohere/msmarco-v2-embed-english-v3).

Given the size of this dataset 138.3M documents with 1024 dimension vectors you
need a cluster with at least 60GB of total RAM available to run performant HNSW queries.
The corpus contains the 138M passages of the original [MSMARCO (passage, version 2)](https://ir-datasets.com/msmarco-passage-v2.html) corpus, embedded
into 1024-dimensional vectors with the [Cohere `embed-english-v3.0` model](https://cohere.com/blog/introducing-embed-v3).

### Generating the document dataset

@@ -42,20 +41,32 @@ This will build 47 `cohere-documents-XX.json` files for the entire dataset of 138

### Generating the queries

The `queries.json` can be rebuilt using the `_tools/parse_queries.py`, this will load the msmarco v2 passages queries dataset, and then call the Cohere embed API for each query, and store the embeddings in `queries.json`.
The `queries.json` file can be rebuilt using `_tools/parse_queries.py -t`. This loads the msmarco v2 passages queries dataset, calls the Cohere embed API for each query, and stores the embeddings in `queries.json`.
This will take a very long time, so maybe grab a ☕️?

You will need a production API key from [Cohere](https://dashboard.cohere.com/api-keys), as the trial keys are heavily rate-limited:

```console
$ export COHERE_API_KEY='abcdefghijklmnopqrstuvwxyz'
$ python _tools/parse_queries.py
$ python _tools/parse_queries.py -t
```
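
Each line of `queries.json` is a single JSON-encoded query embedding (a 1024-element float array), one query per line. The sketch below shows how a consumer might read the file; the file name and dimensionality come from this track, but the snippet itself is illustrative only and not part of the track:

```python
import json

# Illustrative only: load the query embeddings produced by `_tools/parse_queries.py -t`.
# Each line of queries.json is assumed to hold one JSON-encoded embedding
# (a list of 1024 floats) for a single query.
with open("queries.json") as f:
    embeddings = [json.loads(line) for line in f if line.strip()]

print(f"loaded {len(embeddings)} query vectors of dimension {len(embeddings[0])}")
```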

### Generating the queries for the recall operation

The `queries-recall.json` file can be rebuilt using `_tools/parse_queries.py -r`. This loads the msmarco v2 passages test queries, calls the Cohere embed API for each query, and stores the embeddings in `queries-recall.json`, along with the true top 1000 hits for each query computed with a brute-force search.
For the relevance metrics, the `qrels.tsv` file contains relevance judgments for all the queries listed in `queries.json`. This file is copied from the original data available at [msmarco-passage-v2/trec-dl-2022/judged](https://ir-datasets.com/msmarco-passage-v2.html#msmarco-passage-v2).

```console
$ export COHERE_API_KEY='abcdefghijklmnopqrstuvwxyz'
$ python _tools/parse_queries.py -r
```
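
Each line of `queries-recall.json` is a JSON object holding the query id, the query text, its embedding, and the brute-force top 1000 hits as (docid, score) pairs. A minimal sketch of inspecting the file follows; the field names mirror what `_tools/parse_queries.py -r` writes, but the snippet is illustrative only:

```python
import json

# Illustrative only: inspect the ground-truth file written by `_tools/parse_queries.py -r`.
# Each record is assumed to contain: query_id, text, emb (the query embedding), and ids,
# the true top 1000 (docid, score) pairs computed with the brute-force script_score query.
with open("queries-recall.json") as f:
    for line in f:
        record = json.loads(line)
        top_docid, top_score = record["ids"][0]
        print(record["query_id"], len(record["ids"]), top_docid, round(top_score, 4))
        break  # only show the first record
```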

### Parameters

This track accepts the following parameters with Rally 0.8.0+ using `--track-params`:

- `aggressive_merge_policy` (default: false): Whether to apply a more aggressive merge strategy.
- `index_refresh_interval` (default: unset): The index refresh interval.
- `initial_indexing_bulk_indexing_clients` (default: 5)
- `initial_indexing_ingest_percentage` (default: 100)
- `initial_indexing_bulk_size` (default: 500)
@@ -64,10 +75,10 @@ This track accepts the following parameters with Rally 0.8.0+ using `--track-par
- `number_of_replicas` (default: 0)
- `parallel_indexing_bulk_clients` (default: 1)
- `parallel_indexing_bulk_target_throughput` (default: 1)
- `parallel_indexing_time_period` (default: 1800)
- `parallel_indexing_search_clients` (default: 3)
- `parallel_indexing_search_target_throughput` (default: 100)
- `post_ingest_sleep` (default: false): Whether to pause after ingest and prior to subsequent operations.
- `post_ingest_sleep_duration` (default: 30): Sleep duration in seconds.
- `search_ops` (default: [(10, 20, 0), (10, 20, 20), (10, 50, 0), (10, 50, 20), (10, 100, 0), (10, 100, 20), (10, 200, 0), (10, 200, 20), (10, 500, 0), (10, 500, 20), (10, 1000, 0), (10, 1000, 20), (100, 120, 0), (100, 120, 120), (100, 200, 0), (100, 200, 120), (100, 500, 0), (100, 500, 120), (100, 1000, 0), (100, 1000, 120)]): The search and recall operations to run, each given as a (k, num_candidates, num_rescore) tuple.
- `standalone_search_iterations` (default: 10000)
- `vector_index_type` (default: "int8_hnsw"): The type of vector index used to store the vectors.
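
Each `search_ops` tuple expands into a search operation and a recall operation in `operations/default.json`; when `num_rescore` is 0, the rescore suffix is dropped from the generated names. The sketch below mirrors the Jinja naming logic in this track and is illustrative only:

```python
# Illustrative only: how a (k, num_candidates, num_rescore) tuple from `search_ops`
# maps to the operation names generated by the Jinja templates in this track.
def operation_names(k: int, num_candidates: int, num_rescore: int) -> tuple[str, str]:
    suffix = f"{k}-{num_candidates}"
    if num_rescore > 0:
        suffix += f"-{num_rescore}"
    return f"knn-search-{suffix}", f"knn-recall-{suffix}"

print(operation_names(10, 100, 20))   # ('knn-search-10-100-20', 'knn-recall-10-100-20')
print(operation_names(100, 1000, 0))  # ('knn-search-100-1000', 'knn-recall-100-1000')
```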
60 changes: 56 additions & 4 deletions msmarco-v2-vector/_tools/parse_queries.py
@@ -1,15 +1,33 @@
import argparse
import asyncio
import json
import sys
from os import environ

import ir_datasets
import numpy
import vg
from cohere import AsyncClient
from elasticsearch import AsyncElasticsearch

DATASET_NAME: str = "msmarco-passage-v2/train"
RECALL_DATASET_NAME: str = "msmarco-passage-v2/trec-dl-2022/judged"
OUTPUT_FILENAME: str = "queries.json"
MAX_DOCS = 12_000
OUTPUT_RECALL_FILENAME: str = "queries-recall.json"
MAX_DOCS: int = 12_000
REQUEST_TIMEOUT: int = 60 * 60 * 5


def get_brute_force_query(emb):
return {
"script_score": {
"query": {"match_all": {}},
"script": {
"source": "double value = dotProduct(params.query_vector, 'emb'); return sigmoid(1, Math.E, -value);",
"params": {"query_vector": emb},
},
}
}


async def retrieve_embed_for_query(co, text):
@@ -25,7 +43,6 @@ async def output_queries(queries_file):
async with AsyncClient(environ["COHERE_API_KEY"]) as co:
co_queries = []
for query in dataset.queries_iter():
print(query)
co_queries.append(query.text)

# Run our async requests every 100 queries *or* as soon as we
@@ -42,10 +59,45 @@ async def output_queries(queries_file):
queries_file.write("\n".join(json.dumps(embed) for embed in output))


async def main():
async def output_recall_queries(queries_file):
async with AsyncElasticsearch(
"https://localhost:19200/", basic_auth=("esbench", "super-secret-password"), verify_certs=False, request_timeout=REQUEST_TIMEOUT
) as es:
dataset = ir_datasets.load("msmarco-passage-v2/trec-dl-2022/judged")
async with AsyncClient(environ["COHERE_API_KEY"]) as co:
count = 0
for query in dataset.queries_iter():
emb = await retrieve_embed_for_query(co, query[1])
resp = await es.search(
index="msmarco-v2", query=get_brute_force_query(emb), size=1000, _source=["_none_"], fields=["docid"]
)
ids = [(hit["fields"]["docid"][0], hit["_score"]) for hit in resp["hits"]["hits"]]
line = {"query_id": query[0], "text": query[1], "emb": emb, "ids": ids}
queries_file.write(json.dumps(line) + "\n")
count += 1


async def create_queries():
with open(OUTPUT_FILENAME, "w") as queries_file:
await output_queries(queries_file)


async def create_recall_queries():
with open(OUTPUT_RECALL_FILENAME, "w") as queries_file:
await output_recall_queries(queries_file)


if __name__ == "__main__":
asyncio.run(main())
parser = argparse.ArgumentParser(description="Create queries for throughput or recall operations")
parser.add_argument("-t", "--throughput", help="Create queries for throughput operations", action="store_true")
parser.add_argument("-r", "--recall", help="Create queries for recall operations", action="store_true")

if len(sys.argv) == 1:
# Neither -t nor -r was given; show the available options
parser.print_help(sys.stderr)
args = parser.parse_args()
loop = asyncio.get_event_loop()
if args.throughput:
loop.run_until_complete(create_queries())
if args.recall:
loop.run_until_complete(create_recall_queries())
2 changes: 2 additions & 0 deletions msmarco-v2-vector/_tools/requirements.txt
@@ -1,3 +1,5 @@
argparse
elasticsearch[async]
cohere
datasets
ir-datasets
62 changes: 35 additions & 27 deletions msmarco-v2-vector/challenges/default.json
@@ -42,61 +42,69 @@
"retry-until-success": true,
"include-in-reporting": false
}
},
{# serverless-post-ingest-sleep-marker-start #}{%- if post_ingest_sleep|default(false) -%}
}
{# serverless-post-ingest-sleep-marker-start #}{%- if post_ingest_sleep|default(false) -%},
{
"name": "post-ingest-sleep",
"operation": {
"operation-type": "sleep",
"duration": {{ post_ingest_sleep_duration|default(30) }}
}
},
}
{%- endif -%}{# serverless-post-ingest-sleep-marker-end #}
{%- for i in range(p_search_ops|length) %},
{
"name": "standalone-search-knn-10-100-single-client",
"operation": "knn-search-10-100",
"warmup-iterations": 100,
{%- if p_search_ops[i][2] > 0 -%}
"name": "standalone-search-knn-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}-{{p_search_ops[i][2]}}-single-client",
"operation": "knn-search-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}-{{p_search_ops[i][2]}}"
{%- else -%}
"name": "standalone-search-knn-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}-single-client",
"operation": "knn-search-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}"
{%- endif -%},
"warmup-iterations": 1000,
"iterations": {{ standalone_search_iterations | default(10000) | int }}
},
{
"name": "standalone-knn-search-100-1000-single-client",
"operation": "knn-search-100-1000",
"warmup-iterations": 100,
"iterations": {{ standalone_search_iterations | default(10000) | int }}
},
{
"name": "standalone-search-knn-10-100-multiple-clients",
"operation": "knn-search-10-100",
"warmup-iterations": 100,
{%- if p_search_ops[i][2] > 0 -%}
"name": "standalone-search-knn-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}-{{p_search_ops[i][2]}}-multiple-clients",
"operation": "knn-search-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}-{{p_search_ops[i][2]}}"
{%- else -%}
"name": "standalone-search-knn-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}-multiple-clients",
"operation": "knn-search-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}"
{%- endif -%},
"warmup-iterations": 1000,
"clients": {{ standalone_search_clients | default(8) | int }},
"iterations": {{ standalone_search_iterations | default(10000) | int }}
},
{
"name": "standalone-search-knn-100-1000-multiple-clients",
"operation": "knn-search-100-1000",
"warmup-iterations": 100,
"clients": {{ standalone_search_clients | default(8) | int }},
"iterations": {{ standalone_search_iterations | default(10000) | int }}
},
}
{%- endfor %},
{
"parallel": {
"completed-by": "parallel-documents-indexing-bulk",
"tasks": [
{
"name": "parallel-documents-indexing-bulk",
"operation": "parallel-documents-indexing",
"warmup-time-period": 60,
"clients": {{ parallel_indexing_bulk_clients | default(1) | int }},
"time-period": {{ parallel_indexing_time_period | default(1800) | int }},
"target-throughput": {{ parallel_indexing_bulk_target_throughput | default(1) | int }}
},
{
"name": "parallel-documents-indexing-search-knn-10-100",
"operation": "knn-search-10-100",
"clients": {{ parallel_indexing_search_clients | default(3) | int }},
"time-period": {{ parallel_indexing_time_period | default(1800) | int }},
"target-throughput": {{ parallel_indexing_search_target_throughput | default(100) | int }}
"warmup-time-period": 60,
"clients": {{ parallel_indexing_search_clients | default(1) | int }}
}
]
}
}
{%- for i in range(p_search_ops|length) %},
{
{%- if p_search_ops[i][2] > 0 -%}
"operation": "knn-recall-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}-{{p_search_ops[i][2]}}"
{%- else -%}
"operation": "knn-recall-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}"
{%- endif -%}
}
{%- endfor %}
]
}
25 changes: 21 additions & 4 deletions msmarco-v2-vector/index-vectors-only-mapping.json
@@ -1,19 +1,36 @@
{
"settings": {
{# non-serverless-index-settings-marker-start #}{%- if build_flavor != "serverless" or serverless_operator == true -%}
"index": {
{%- if index_refresh_interval is defined %}
"index_refresh_interval": {{ index_refresh_interval | tojson }},
{%- endif %}
{% if preload_pagecache %}
"index.store.preload": [ "vec", "vex", "vem"],
"store.preload": [ "vec", "vex", "vem"],
{% endif %}
"index.number_of_shards": {{number_of_shards | default(1)}},
"index.number_of_replicas": {{number_of_replicas | default(0)}}
"number_of_shards": {{number_of_shards | default(1)}},
"number_of_replicas": {{number_of_replicas | default(0)}}
{% if aggressive_merge_policy %},
"merge": {
"policy": {
"max_merged_segment": "25gb",
"floor_segment": "1gb",
"segments_per_tier": 5
}
}
{% endif %}
}
{%- endif -%}{# non-serverless-index-settings-marker-end #}
},
"mappings": {
"dynamic": false,
"_source": {
"enabled": false
"mode": "synthetic"
},
"properties": {
"docid": {
"type": "keyword"
},
"emb": {
"type": "dense_vector",
"element_type": "float",
34 changes: 24 additions & 10 deletions msmarco-v2-vector/operations/default.json
@@ -22,20 +22,34 @@
"name": "parallel-documents-indexing",
"operation-type": "bulk",
"corpora": "msmarco-v2-parallel-indexing",
"bulk-size": {{parallel_indexing_bulk_size | default(500)}},
"bulk-size": {{parallel_indexing_bulk_size | default(50)}},
"ingest-percentage": {{parallel_indexing_ingest_percentage | default(100)}}
},
}
{%- set p_search_ops = (search_ops | default([(10, 20, 0), (10, 20, 20), (10, 50, 0), (10, 50, 20), (10, 100, 0), (10, 100, 20), (10, 200, 0), (10, 200, 20), (10, 500, 0), (10, 500, 20), (10, 1000, 0), (10, 1000, 20), (100, 120, 0), (100, 120, 120), (100, 200, 0), (100, 200, 120), (100, 500, 0), (100, 500, 120), (100, 1000, 0), (100, 1000, 120)]))%}
{%- for i in range(p_search_ops|length) %},
{
"name": "knn-search-10-100",
{%- if p_search_ops[i][2] > 0 -%}
"name": "knn-search-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}-{{p_search_ops[i][2]}}"
{%- else -%}
"name": "knn-search-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}"
{%- endif -%},
"operation-type": "search",
"param-source": "knn-param-source",
"k": 10,
"num-candidates": 100
"k": {{p_search_ops[i][0]}},
"num-candidates": {{p_search_ops[i][1]}},
"num-rescore": {{p_search_ops[i][2]}}
},
{
"name": "knn-search-100-1000",
"operation-type": "search",
"param-source": "knn-param-source",
"k": 100,
"num-candidates": 1000
{%- if p_search_ops[i][2] > 0 -%}
"name": "knn-recall-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}-{{p_search_ops[i][2]}}"
{%- else -%}
"name": "knn-recall-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}"
{%- endif -%},
"operation-type": "knn-recall",
"param-source": "knn-recall-param-source",
"k": {{p_search_ops[i][0]}},
"num-candidates": {{p_search_ops[i][1]}},
"num-rescore": {{p_search_ops[i][2]}},
"include-in-reporting": false
}
{%- endfor %}