-
Notifications
You must be signed in to change notification settings - Fork 49
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Create notebook file with default ingest script
Take the default script used for ingest and turn it into a notebook file so customers can more easily read the script and make edits. For details, refer to @jon's refactoring doc, "Sycamore Deployment - Containers": https://www.notion.so/Sycamore-Deployment-Containers-91c4f7602b984e9991642f5cc8193ba9?pvs=4
- Loading branch information
1 parent
b6510fc
commit 830f202
Showing
1 changed file
with
275 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,275 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "e3736f91-5695-4beb-8591-227eb444adc0", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from pathlib import Path\n", | ||
"import datetime\n", | ||
"import math\n", | ||
"import numpy\n", | ||
"import os\n", | ||
"import requests\n", | ||
"import stat\n", | ||
"import sys\n", | ||
"import time\n", | ||
"\n", | ||
"import sycamore\n", | ||
"from sycamore.functions import HuggingFaceTokenizer, TextOverlapChunker\n", | ||
"from sycamore.llms import OpenAI, OpenAIModels\n", | ||
"from sycamore.transforms.embed import SentenceTransformerEmbedder\n", | ||
"from sycamore.transforms.extract_entity import OpenAIEntityExtractor\n", | ||
"from sycamore.transforms.merge_elements import GreedyTextElementMerger\n", | ||
"from sycamore.transforms.partition import UnstructuredPdfPartitioner, HtmlPartitioner\n", | ||
"\n", | ||
"index = \"demoindex0\"" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "04cb55d4-3ad7-4003-bc09-6ea1b1ca4432", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"%%bash\n", | ||
"pwd\n", | ||
"sudo mkdir -p /app/work/crawl_data/downloads/pdf\n", | ||
"sudo mkdir -p /app/work/crawl_data/downloads/html\n", | ||
"sudo curl https://sortbenchmark.org/ELSAR2022.pdf -o /app/work/crawl_data/downloads/pdf/elsar.pdf\n", | ||
"sudo curl https://sortbenchmark.org -o /app/work/crawl_data/downloads/html/home.html" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "1fbc683e-00e9-4c25-9ce0-e284584192e4", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def get_os_client_args():\n", | ||
" args = {\n", | ||
" \"hosts\": [{\"host\": \"opensearch\", \"port\": 9200}],\n", | ||
" \"http_compress\": True,\n", | ||
" \"http_auth\": (\"admin\", \"admin\"),\n", | ||
" \"use_ssl\": False,\n", | ||
" \"verify_certs\": False,\n", | ||
" \"ssl_assert_hostname\": False,\n", | ||
" \"ssl_show_warn\": False,\n", | ||
" \"timeout\": 120,\n", | ||
" }\n", | ||
" return args\n", | ||
"\n", | ||
"def get_index_settings():\n", | ||
" return {\n", | ||
" \"body\": {\n", | ||
" \"settings\": {\"index.knn\": True, \"number_of_shards\": 5, \"number_of_replicas\": 1},\n", | ||
" \"mappings\": {\n", | ||
" \"properties\": {\n", | ||
" \"text\": {\"type\": \"text\"},\n", | ||
" \"embedding\": {\n", | ||
" \"dimension\": 384,\n", | ||
" \"method\": {\"engine\": \"nmslib\", \"space_type\": \"l2\", \"name\": \"hnsw\", \"parameters\": {}},\n", | ||
" \"type\": \"knn_vector\",\n", | ||
" },\n", | ||
" \"title\": {\"type\": \"text\"},\n", | ||
" \"searchable_text\": {\"type\": \"text\"},\n", | ||
" \"title_embedding\": {\n", | ||
" \"dimension\": 384,\n", | ||
" \"method\": {\"engine\": \"nmslib\", \"space_type\": \"l2\", \"name\": \"hnsw\", \"parameters\": {}},\n", | ||
" \"type\": \"knn_vector\",\n", | ||
" },\n", | ||
" \"url\": {\"type\": \"text\"},\n", | ||
" }\n", | ||
" },\n", | ||
" }\n", | ||
" }" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "95ab0a44-9b94-45f1-a892-d4ffa50931f6", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def get_title_context_template():\n", | ||
" # ruff: noqa: E501\n", | ||
" return \"\"\"\n", | ||
" ELEMENT 1: Jupiter's Moons\n", | ||
" ELEMENT 2: Ganymede 2020\n", | ||
" ELEMENT 3: by Audi Lauper and Serena K. Goldberg. 2011\n", | ||
" ELEMENT 4: From Wikipedia, the free encyclopedia\n", | ||
" ELEMENT 5: Ganymede, or Jupiter III, is the largest and most massive natural satellite of Jupiter as well as in the Solar System, being a planetary-mass moon. It is the largest Solar System object without an atmosphere, despite being the only moon of the Solar System with a magnetic field. Like Titan, it is larger than the planet Mercury, but has somewhat less surface gravity than Mercury, Io or the Moon.\n", | ||
" =========\n", | ||
" \"Ganymede 2020\"\n", | ||
"\n", | ||
" ELEMENT 1: FLAVR: Flow-Agnostic Video Representations for Fast Frame Interpolation\n", | ||
" ELEMENT 2: Tarun Kalluri * UCSD\n", | ||
" ELEMENT 3: Deepak Pathak CMU\n", | ||
" ELEMENT 4: Manmohan Chandraker UCSD\n", | ||
" ELEMENT 5: Du Tran Facebook AI\n", | ||
" ELEMENT 6: https://tarun005.github.io/FLAVR/\n", | ||
" ELEMENT 7: 2 2 0 2\n", | ||
" ELEMENT 8: b e F 4 2\n", | ||
" ELEMENT 9: ]\n", | ||
" ELEMENT 10: V C . s c [\n", | ||
" ========\n", | ||
" \"FLAVR: Flow-Agnostic Video Representations for Fast Frame Interpolation\"\n", | ||
"\n", | ||
" \"\"\"\n", | ||
"\n", | ||
"def get_author_context_template():\n", | ||
" # ruff: noqa: E501\n", | ||
" return \"\"\"\n", | ||
" ELEMENT 1: Jupiter's Moons\n", | ||
" ELEMENT 2: Ganymede 2020\n", | ||
" ELEMENT 3: by Audi Lauper and Serena K. Goldberg. 2011\n", | ||
" ELEMENT 4: From Wikipedia, the free encyclopedia\n", | ||
" ELEMENT 5: Ganymede, or Jupiter III, is the largest and most massive natural satellite of Jupiter as well as in the Solar System, being a planetary-mass moon. It is the largest Solar System object without an atmosphere, despite being the only moon of the Solar System with a magnetic field. Like Titan, it is larger than the planet Mercury, but has somewhat less surface gravity than Mercury, Io or the Moon.\n", | ||
" =========\n", | ||
" Audi Laupe, Serena K. Goldberg\n", | ||
"\n", | ||
" ELEMENT 1: FLAVR: Flow-Agnostic Video Representations for Fast Frame Interpolation\n", | ||
" ELEMENT 2: Tarun Kalluri * UCSD\n", | ||
" ELEMENT 3: Deepak Pathak CMU\n", | ||
" ELEMENT 4: Manmohan Chandraker UCSD\n", | ||
" ELEMENT 5: Du Tran Facebook AI\n", | ||
" ELEMENT 6: https://tarun005.github.io/FLAVR/\n", | ||
" ELEMENT 7: 2 2 0 2\n", | ||
" ELEMENT 8: b e F 4 2\n", | ||
" ELEMENT 9: ]\n", | ||
" ELEMENT 10: V C . s c [\n", | ||
" ========\n", | ||
" Tarun Kalluri, Deepak Pathak, Manmohan Chandraker, Du Tran\n", | ||
"\n", | ||
" \"\"\"" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "26836da3-4b96-412f-83e3-92bbe73ad5e3", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def import_pdf(paths):\n", | ||
" if len(paths) == 0:\n", | ||
" print(\"WARNING: import_pdf called with empty paths\")\n", | ||
" return\n", | ||
"\n", | ||
" openai_llm = OpenAI(OpenAIModels.GPT_3_5_TURBO_INSTRUCT.value)\n", | ||
" tokenizer = HuggingFaceTokenizer(\"sentence-transformers/all-MiniLM-L6-v2\")\n", | ||
" merger = GreedyTextElementMerger(tokenizer, 256)\n", | ||
"\n", | ||
" ctx = sycamore.init()\n", | ||
" (\n", | ||
" ctx.read.binary(paths, binary_format=\"pdf\", filter_paths_by_extension=False)\n", | ||
" .partition(\n", | ||
" partitioner=UnstructuredPdfPartitioner(),\n", | ||
" )\n", | ||
" .merge(merger)\n", | ||
" .extract_entity(\n", | ||
" entity_extractor=OpenAIEntityExtractor(\n", | ||
" \"title\", llm=openai_llm, prompt_template=get_title_context_template()\n", | ||
" )\n", | ||
" )\n", | ||
" .extract_entity(\n", | ||
" entity_extractor=OpenAIEntityExtractor(\n", | ||
" \"authors\", llm=openai_llm, prompt_template=get_author_context_template()\n", | ||
" )\n", | ||
" )\n", | ||
" .spread_properties([\"path\", \"title\"])\n", | ||
" .explode()\n", | ||
" .embed(\n", | ||
" embedder=SentenceTransformerEmbedder(batch_size=100, model_name=\"sentence-transformers/all-MiniLM-L6-v2\")\n", | ||
" )\n", | ||
" .write.opensearch(os_client_args=get_os_client_args(), index_name=index, index_settings=get_index_settings())\n", | ||
" )" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "71fff973-09a7-46fe-b8d1-cc82d46e1ec0", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def import_html(paths):\n", | ||
" if len(paths) == 0:\n", | ||
" print(\"WARNING: import_html called with empty paths\")\n", | ||
" return\n", | ||
"\n", | ||
" ctx = sycamore.init()\n", | ||
" (\n", | ||
" ctx.read.binary(paths, binary_format=\"html\", filter_paths_by_extension=False)\n", | ||
" .partition(\n", | ||
" partitioner=HtmlPartitioner(\n", | ||
" extract_tables=True,\n", | ||
" text_chunker=TextOverlapChunker(chunk_token_count=4000, chunk_overlap_token_count=400),\n", | ||
" )\n", | ||
" )\n", | ||
" .spread_properties([\"path\", \"title\"])\n", | ||
" .explode()\n", | ||
" .embed(\n", | ||
" embedder=SentenceTransformerEmbedder(batch_size=100, model_name=\"sentence-transformers/all-MiniLM-L6-v2\")\n", | ||
" )\n", | ||
" .write.opensearch(os_client_args=get_os_client_args(), index_name=index, index_settings=get_index_settings())\n", | ||
" )" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "a259fa57-88ce-4260-96b4-11c55efedc25", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import_pdf(\"/app/work/crawl_data/downloads/pdf\")\n", | ||
"import_html(\"/app/work/crawl_data/downloads/html\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "73a42faf-62f7-4816-ae2e-6844c1224730", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"print(\"Visit http://localhost:3000 and use the\", index, \" index to query these results in the UI\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "2f2aed0b-0b96-4697-98b9-ff0131328540", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.11.2" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |