forked from querqy/chorus-elasticsearch-edition
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathquickstart.sh
executable file
·319 lines (270 loc) · 10.6 KB
/
quickstart.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
#!/bin/bash -e
# This script starts up Chorus and runs through the basic setup tasks.
# Ansi color code variables
ERROR='\033[0;31m[QUICKSTART] '
MAJOR='\033[0;34m[QUICKSTART] '
MINOR='\033[0;37m[QUICKSTART] '
RESET='\033[0m' # No Color
export DOCKER_SCAN_SUGGEST=false
if ! [ -x "$(command -v curl)" ]; then
echo "${ERROR}Error: curl is not installed.${RESET}" >&2
exit 1
fi
if ! [ -x "$(command -v docker)" ]; then
echo "${ERROR}Error: docker is not installed.${RESET}" >&2
exit 1
fi
if ! [ -x "$(command -v wget)" ]; then
echo "${ERROR}Error: wget is not installed.${RESET}" >&2
exit 1
fi
observability=false
shutdown=false
offline_lab=false
local_deploy=true
stop=false
full_dataset=false
while [ ! $# -eq 0 ]
do
case "$1" in
--help | -h)
echo -e "Use the option --with-offline-lab | -lab to include Quepid service in Chorus."
echo -e "Use the option --shutdown | -s to shutdown and remove the Docker containers and data."
echo -e "Use the option --stop to stop the Docker containers."
echo -e "Use the option --online-deployment | -online to update configuration to run on chorus-opensearch-edition.dev.o19s.com environment."
echo -e "Use the option --full-dataset | -full to index the whole data set. This takes some time depending on your hardware."
exit
;;
--with-offline-lab | -lab)
offline_lab=true
echo -e "${MAJOR}Running Chorus with offline lab environment enabled\n${RESET}"
;;
--shutdown | -s)
shutdown=true
echo -e "${MAJOR}Shutting down Chorus\n${RESET}"
;;
--stop)
stop=true
echo -e "${MAJOR}Stopping Chorus\n${RESET}"
;;
--online-deployment | -online)
local_deploy=false
echo -e "${MAJOR}Configuring Chorus for chorus-opensearch-edition.dev.o19s.com environment\n${RESET}"
;;
--full-dataset | -full)
full_dataset=true
echo -e "${MAJOR}Indexing whole data set\n${RESET}"
;;
esac
shift
done
services="opensearch opensearch-dashboards middleware reactivesearch"
if $offline_lab; then
services="${services} quepid"
fi
if ! $local_deploy; then
echo -e "${MAJOR}Updating configuration files for online deploy${RESET}"
sed -i.bu 's/localhost/chorus-opensearch-edition.dev.o19s.com/g' ./reactivesearch/src/App.js
sed -i.bu 's/localhost/chorus-opensearch-edition.dev.o19s.com/g' ./opensearch/wait-for-os.sh
fi
if $stop; then
docker compose stop ${services}
exit
fi
if $shutdown; then
docker compose down -v
exit
fi
docker compose up -d --build ${services}
echo -e "${MAJOR}Waiting for OpenSearch to start up and be online.${RESET}"
./opensearch/wait-for-os.sh # Wait for OpenSearch to be online
echo -e "${MAJOR}Configuring the ML Commons plugin.${RESET}"
curl -s -X PUT "http://localhost:9200/_cluster/settings" -H 'Content-Type: application/json' --data-binary '{
"persistent": {
"plugins": {
"ml_commons": {
"only_run_on_ml_node": "false",
"model_access_control_enabled": "true",
"native_memory_threshold": "99"
}
}
}
}'
echo -e "${MAJOR}Registering a model group.${RESET}"
response=$(curl -s -X POST "http://localhost:9200/_plugins/_ml/model_groups/_register" \
-H 'Content-Type: application/json' \
--data-binary '{
"name": "neural_search_model_group",
"description": "A model group for neural search models"
}')
# Extract the model_group_id from the JSON response
model_group_id=$(echo "$response" | jq -r '.model_group_id')
# Use the extracted model_group_id
echo "Created Model Group with id: $model_group_id"
echo -e "${MAJOR}Registering a model in the model group.${RESET}"
response=$(curl -s -X POST "http://localhost:9200/_plugins/_ml/models/_register" \
-H 'Content-Type: application/json' \
--data-binary "{
\"name\": \"huggingface/sentence-transformers/all-MiniLM-L6-v2\",
\"version\": \"1.0.1\",
\"model_group_id\": \"$model_group_id\",
\"model_format\": \"TORCH_SCRIPT\"
}")
# Extract the task_id from the JSON response
task_id=$(echo "$response" | jq -r '.task_id')
# Use the extracted task_id
echo "Created Model, get status with task id: $task_id"
echo -e "${MAJOR}Waiting for the model to be registered.${RESET}"
max_attempts=10
attempts=0
# Wait for task to be COMPLETED
while [[ "$(curl -s localhost:9200/_plugins/_ml/tasks/$task_id | jq -r '.state')" != "COMPLETED" && $attempts -lt $max_attempts ]]; do
echo "Waiting for task to complete... attempt $((attempts + 1))/$max_attempts"
sleep 5
attempts=$((attempts + 1))
done
if [[ $attempts -ge $max_attempts ]]; then
echo "Limit of attempts reached. Something went wrong with registering the model. Check OpenSearch logs."
exit 1
else
response=$(curl -s localhost:9200/_plugins/_ml/tasks/$task_id)
model_id=$(echo "$response" | jq -r '.model_id')
echo "Task completed successfully! Model registered with id: $model_id"
fi
echo -e "${MAJOR}Deploying the model.${RESET}"
response=$(curl -s -X POST "http://localhost:9200/_plugins/_ml/models/$model_id/_deploy")
# Extract the task_id from the JSON response
deploy_task_id=$(echo "$response" | jq -r '.task_id')
echo "Model deployment started, get status with task id: $deploy_task_id"
echo -e "${MAJOR}Waiting for the model to be deployed.${RESET}"
# Reset attempts
attempts=0
while [[ "$(curl -s localhost:9200/_plugins/_ml/tasks/$task_id | jq -r '.state')" != "COMPLETED" && $attempts -lt $max_attempts ]]; do
echo "Waiting for task to complete... attempt $((attempts + 1))/$max_attempts"
sleep 5
attempts=$((attempts + 1))
done
if [[ $attempts -ge $max_attempts ]]; then
echo "Limit of attempts reached. Something went wrong with deploying the model. Check OpenSearch logs."
else
response=$(curl -s localhost:9200/_plugins/_ml/tasks/$task_id)
model_id=$(echo "$response" | jq -r '.model_id')
echo "Task completed successfully! Model deployed with id: $model_id"
fi
echo -e "${MAJOR}Creating an ingest pipeline for embedding generation during index time.${RESET}"
curl -s -X PUT "http://localhost:9200/_ingest/pipeline/embeddings-pipeline" \
-H 'Content-Type: application/json' \
--data-binary "{
\"description\": \"A text embedding pipeline\",
\"processors\": [
{
\"text_embedding\": {
\"model_id\": \"$model_id\",
\"field_map\": {
\"title\": \"title_embedding\"
}
}
}
]
}"
echo -e "${MAJOR}Setting up User Behavior Insights indexes...\n${RESET}"
curl -s -X POST "http://localhost:9200/_plugins/ubi/initialize"
echo -e "${MAJOR}Creating ecommerce index, defining its mapping & settings\n${RESET}"
curl -s -X PUT "http://localhost:9200/ecommerce" -H 'Content-Type: application/json' --data-binary @./opensearch/schema.json
echo -e "\n"
echo -e "${MAJOR}Prepping Data for Ingestion\n${RESET}"
if [ ! -f ./esci.json.zst ]; then
echo -e "${MINOR}Downloading the sample product data\n${RESET}"
wget https://esci-s.s3.amazonaws.com/esci.json.zst
fi
if [ ! -f ./transformed_esci_1.json ]; then
echo -e "${MINOR}Transforming the sample product data into JSON format, please give it a few minutes!\n${RESET}"
docker run -v "$(pwd)":/app -w /app python:3 bash -c "pip install -r requirements.txt && python3 ./opensearch/transform_data.py"
fi
echo -e "${MAJOR}Indexing the product data, please wait...\n${RESET}"
# Define the OpenSearch endpoint and content header
OPENSEARCH_URL="http://localhost:9200/ecommerce/_bulk?pretty=false&filter_path=-items"
CONTENT_TYPE="Content-Type: application/json"
# Loop through each JSON file with the prefix "transformed_esci_"
for file in transformed_esci_*.json; do
if [[ -f "$file" ]]; then
if [[ "$file" == "transformed_esci_11.json" && "$full_dataset" == "false" ]]; then
echo "Indexing subset of full data set, exiting loop. Run quickstart.sh with --full-dataset option to index whole data set."
break
fi
echo "Processing $file..."
# Send the file to OpenSearch using curl
curl -X POST "$OPENSEARCH_URL" -H "$CONTENT_TYPE" --data-binary @"$file"
# Check the response code to see if the request was successful
if [[ $? -ne 0 ]]; then
echo "Failed to send $file"
else
echo "$file successfully sent to OpenSearch"
fi
else
echo "No files found with the prefix 'transformed_esci_'"
fi
done
echo -e "${MAJOR}Creating pipelines for neural search and hybrid search\n${RESET}"
curl -s -X PUT "http://localhost:9200/_search/pipeline/neural-search-pipeline" \
-H 'Content-Type: application/json' \
--data-binary "{
\"description\": \"Neural Only Search\",
\"request_processors\": [
{
\"neural_query_enricher\" : {
\"description\": \"Sets the default model ID at index and field levels\",
\"default_model_id\": \"$model_id\",
\"neural_field_default_id\": {
\"title_embeddings\": \"$model_id\"
}
}
}
]
}"
curl -s -X PUT "http://localhost:9200/_search/pipeline/hybrid-search-pipeline" \
-H 'Content-Type: application/json' \
--data-binary "{
\"request_processors\": [
{
\"neural_query_enricher\" : {
\"description\": \"Sets the default model ID at index and field levels\",
\"default_model_id\": \"$model_id\",
\"neural_field_default_id\": {
\"title_embeddings\": \"$model_id\"
}
}
}
],
\"phase_results_processors\": [
{
\"normalization-processor\": {
\"normalization\": {
\"technique\": \"min_max\"
},
\"combination\": {
\"technique\": \"arithmetic_mean\",
\"parameters\": {
\"weights\": [
0.3,
0.7
]
}
}
}
}
]
}"
if $offline_lab; then
echo -e "${MAJOR}Setting up Quepid${RESET}"
docker compose run --rm quepid bundle exec bin/rake db:setup
docker compose run quepid bundle exec thor user:create -a [email protected] "Chorus Admin" password
fi
echo -e "${MAJOR}Updating the indexed data with embeddings...\n${RESET}"
update_docs_task_id=$(curl -s -X POST "http://localhost:9200/ecommerce/_update_by_query?pipeline=embeddings-pipeline&wait_for_completion=false" | jq -r '.task')
echo -e "${MAJOR}This process runs in the background. Plese give it a couple of minutes. You can check the progress with the following curl command:
curl -s GET http://localhost:9200/_tasks/$update_docs_task_id\n${RESET}"
# we start dataprepper as the last service to prevent it from creating the ubi_queries index using the wrong mappings.
echo -e "${MAJOR}Starting Dataprepper...\n${RESET}"
docker compose up -d --build dataprepper
echo -e "${MAJOR}Welcome to Chorus OpenSearch Edition!${RESET}"