Skip to content

Commit

Permalink
Add search/mteb/dbpedia relevance evaluation track (#691)
Browse files Browse the repository at this point in the history
* Initial commit of a dbpedia_ranking relevance evaluation track

* Forcing a CI run

* Move dbpedia into a search/mteb directory

* Rename dbpedia_ranking to dbpedia

* Add performance metrics

* Remove accidental file

* Force CI to run

* Fix typo in track

* Remove dev.tsv

* Remove default to dev.tsv
  • Loading branch information
kderusso authored Oct 23, 2024
1 parent 4cd9d4a commit 151f5d8
Show file tree
Hide file tree
Showing 11 changed files with 44,006 additions and 0 deletions.
8 changes: 8 additions & 0 deletions search/mteb/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
## Search MTEB

This track assesses the search performance of various MTEB datasets.

See the individual MTEB tracks to learn more about that specific track.

## License
Terms and Conditions for using the mteb datasets can be found at https://github.com/embeddings-benchmark/mteb
48 changes: 48 additions & 0 deletions search/mteb/dbpedia/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
## DBPedia Passage Ranking Track

This track assesses the search performance of the dataset available at [mteb/dbpedia](https://huggingface.co/datasets/mteb/dbpedia).
To compare search performance, the following strategies are employed:
* `default`: This is a straightforward strategy that involves indexing the text fields using a standard analyzer and querying using a `multi_match` query. No custom analysis is used.
* `english-analyzed`: In this strategy, we perform the same test as with `default`, but with a basic custom English analyzer applied.


### Example Document

Documents adhere to the [JSON Lines format](https://jsonlines.org/).
When a single document is pretty printed, it takes the following example format:

<details>
<summary><i>Example document</i></summary>

```json
{
"_id": "<dbpedia:Animalia_(book)>",
"title": "Animalia (book)",
"text": "Animalia is an illustrated children's book by Graeme Base. It was originally published in 1986, followed by a tenth anniversary edition in 1996, and a 25th anniversary edition in 2012. Over three million copies have been sold. A special numbered and signed anniversary edition was also published in 1996, with an embossed gold jacket."
}
```
</details>

### Example Query

Queries are structured within a JSON array, where each individual object signifies a unique 'query'.

<details>
<summary><i>Example query object</i></summary>

TODO

</details>

### Parameters
This track accepts the following parameters with Rally 0.8.0+ using `--track-params`:

* `bulk_size` (default: 500)
* `bulk_indexing_clients` (default: 8)
* `ingest_percentage` (default: 100)
* `number_of_shards` (default: 1)
* `number_of_replicas` (default: 0)
* `search_clients` (default: 1)

### License
Terms and Conditions for using the mteb datasets can be found at https://github.com/embeddings-benchmark/mteb
2 changes: 2 additions & 0 deletions search/mteb/dbpedia/_tools/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
pytrec_eval
numpy
80 changes: 80 additions & 0 deletions search/mteb/dbpedia/challenges/default.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
{
"name": "dbpedia-ranking",
"description": "Performs relevance benchmarking using the mteb/dbpedia dataset",
"default": true,
"schedule": [
{
"operation": "delete-index"
},
{
"operation": {
"include-in-reporting": false,
"operation-type": "create-index"
}
},
{
"name": "check-cluster-health",
"operation": {
"include-in-reporting": false,
"operation-type": "cluster-health",
"index": "dbpedia-ranking-collection",
"request-params": {
"wait_for_status": "{{cluster_health | default('yellow')}}",
"wait_for_no_relocating_shards": "true"
},
"retry-until-success": true
}
},
{
"operation": {
"operation-type": "bulk",
"bulk-size": {{bulk_size | default(500)}},
"ingest-percentage": {{ingest_percentage | default(100)}}
},
"clients": {{bulk_indexing_clients | default(8)}}
},
{
"name": "refresh-after-index",
"operation": {
"operation-type": "refresh",
"request-timeout": 1000,
"include-in-reporting": true
}
},
{
"name": "wait-until-merges-finish-after-index",
"operation": {
"operation-type": "index-stats",
"index": "_all",
"condition": {
"path": "_all.total.merges.current",
"expected-value": 0
},
"retry-until-success": true,
"include-in-reporting": false
}
},
{
"name": "default-text-search",
"operation": "default-text-search",
"warmup-iterations": 100,
"iterations": 1000,
"clients": {{search_clients | default(1)}}
},
{
"name": "english-analyzed-text-search",
"operation": "english-analyzed-text-search",
"warmup-iterations": 100,
"iterations": 1000,
"clients": {{search_clients | default(1)}}
},
{
"name": "default-text-search-relevance",
"operation": "default-text-search-relevance"
},
{
"name": "english-analyzed-text-search-relevance",
"operation": "english-analyzed-text-search-relevance"
}
]
}
56 changes: 56 additions & 0 deletions search/mteb/dbpedia/dbpedia-ranking-collection.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
{
"mappings": {
"properties": {
"id": {
"type":"keyword"
},
"title": {
"type": "text",
"copy_to": "english_analyzed_title"
},
"text": {
"type": "text",
"copy_to": "english_analyzed_text"
},
"english_analyzed_title": {
"type": "text",
"analyzer": "default_english_analyzer"
},
"english_analyzed_text": {
"type": "text",
"analyzer": "default_english_analyzer"
}
}
},
"settings": {
"index": {
"number_of_replicas": "{{number_of_replicas | default(0)}}",
"number_of_shards": "{{number_of_shards | default(1)}}"
},
"analysis": {
"filter": {
"english-stem-filter": {
"type": "stemmer",
"language": "light_english"
},
"english-stop-words-filter": {
"type": "stop",
"stopwords": "_english_"
}
},
"analyzer": {
"default_english_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"cjk_width",
"lowercase",
"asciifolding",
"english-stop-words-filter",
"english-stem-filter"
]
}
}
}
}
}
2 changes: 2 additions & 0 deletions search/mteb/dbpedia/files.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
documents-1k.json.bz2
documents.json.bz2
52 changes: 52 additions & 0 deletions search/mteb/dbpedia/operations/default.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
{
"name": "default-text-search",
"operation-type": "search",
"param-source": "query_param_source",
"query_source": "queries.json",
"size": 100,
"title_field": "title",
"title_boost": 10,
"text_field": "text",
"text_boost": 1,
"track_total_hits": false,
"include-in-reporting": true
},
{
"name": "english-analyzed-text-search",
"operation-type": "search",
"param-source": "query_param_source",
"query_source": "queries.json",
"size": 100,
"title_field": "english_analyzed_title",
"title_boost": 10,
"text_field": "english_analyzed_text",
"text_boost": 1,
"track_total_hits": false,
"include-in-reporting": true
},
{
"name": "default-text-search-relevance",
"operation-type": "text_search_relevance",
"param-source": "relevance_param_source",
"query_source": "queries.json",
"qrels_source": "test.tsv",
"title_field": "title",
"title_boost": 10,
"text_field": "text",
"text_boost": 1,
"size": 100,
"include-in-reporting": true
},
{
"name": "english-analyzed-text-search-relevance",
"operation-type": "text_search_relevance",
"param-source": "relevance_param_source",
"query_source": "queries.json",
"qrels_source": "test.tsv",
"title_field": "english_analyzed_title",
"title_boost": 10,
"text_field": "english_analyzed_text",
"text_boost": 1,
"size": 100,
"include-in-reporting": true
}
1 change: 1 addition & 0 deletions search/mteb/dbpedia/queries.json

Large diffs are not rendered by default.

Loading

0 comments on commit 151f5d8

Please sign in to comment.