Skip to content

Commit

Permalink
Create notebook file with default ingest script
Browse files Browse the repository at this point in the history
Take the default script being used for ingest and create a notebook
file so customers can more easily see the script and make edits.
For details, refer to @jon's refactoring doc, "Sycamore Deployment - Containers":
https://www.notion.so/Sycamore-Deployment-Containers-91c4f7602b984e9991642f5cc8193ba9?pvs=4
  • Loading branch information
bohou-aryn committed Feb 8, 2024
1 parent b6510fc commit 830f202
Showing 1 changed file with 275 additions and 0 deletions.
275 changes: 275 additions & 0 deletions notebooks/default-prep-script.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,275 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "e3736f91-5695-4beb-8591-227eb444adc0",
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"import datetime\n",
"import math\n",
"import numpy\n",
"import os\n",
"import requests\n",
"import stat\n",
"import sys\n",
"import time\n",
"\n",
"import sycamore\n",
"from sycamore.functions import HuggingFaceTokenizer, TextOverlapChunker\n",
"from sycamore.llms import OpenAI, OpenAIModels\n",
"from sycamore.transforms.embed import SentenceTransformerEmbedder\n",
"from sycamore.transforms.extract_entity import OpenAIEntityExtractor\n",
"from sycamore.transforms.merge_elements import GreedyTextElementMerger\n",
"from sycamore.transforms.partition import UnstructuredPdfPartitioner, HtmlPartitioner\n",
"\n",
"# Name of the OpenSearch index that the import_pdf and import_html pipelines write to.\n",
"index = \"demoindex0\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "04cb55d4-3ad7-4003-bc09-6ea1b1ca4432",
"metadata": {},
"outputs": [],
"source": [
"%%bash\n",
"# Download one sample PDF and one sample HTML page into the crawl_data download folders.\n",
"# Stop at the first failing command so a broken download fails this cell instead of\n",
"# surfacing later as a confusing ingest error.\n",
"set -euo pipefail\n",
"pwd\n",
"sudo mkdir -p /app/work/crawl_data/downloads/pdf\n",
"sudo mkdir -p /app/work/crawl_data/downloads/html\n",
"# -f: exit non-zero on HTTP errors rather than saving the error page as the document;\n",
"# -L: follow redirects; -sS: hide the progress meter but still print errors.\n",
"sudo curl -fsSL https://sortbenchmark.org/ELSAR2022.pdf -o /app/work/crawl_data/downloads/pdf/elsar.pdf\n",
"sudo curl -fsSL https://sortbenchmark.org -o /app/work/crawl_data/downloads/html/home.html"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1fbc683e-00e9-4c25-9ce0-e284584192e4",
"metadata": {},
"outputs": [],
"source": [
"def get_os_client_args():\n",
"    \"\"\"Build the OpenSearch client keyword arguments.\n",
"\n",
"    Returns:\n",
"        dict: settings for the ``opensearch`` host on port 9200 with HTTP\n",
"        compression on, basic auth, SSL and certificate checks off, and a\n",
"        timeout value of 120.\n",
"    \"\"\"\n",
"    # NOTE(review): admin/admin with use_ssl=False looks like a local-demo default - confirm before reuse.\n",
"    endpoint = {\"host\": \"opensearch\", \"port\": 9200}\n",
"    return dict(\n",
"        hosts=[endpoint],\n",
"        http_compress=True,\n",
"        http_auth=(\"admin\", \"admin\"),\n",
"        use_ssl=False,\n",
"        verify_certs=False,\n",
"        ssl_assert_hostname=False,\n",
"        ssl_show_warn=False,\n",
"        timeout=120,\n",
"    )\n",
"\n",
"def get_index_settings():\n",
"    \"\"\"Return the index settings and field mappings for the target index.\n",
"\n",
"    The body turns on ``index.knn``, requests 5 shards and 1 replica, maps\n",
"    ``text``, ``title``, ``searchable_text`` and ``url`` as ``text`` fields, and\n",
"    maps ``embedding`` and ``title_embedding`` as 384-dimension ``knn_vector``\n",
"    fields (nmslib engine, l2 space, hnsw method).\n",
"    \"\"\"\n",
"\n",
"    def text_field():\n",
"        return {\"type\": \"text\"}\n",
"\n",
"    def knn_field():\n",
"        # Built fresh on every call so the two vector fields never share one dict object.\n",
"        return {\n",
"            \"dimension\": 384,\n",
"            \"method\": {\"engine\": \"nmslib\", \"space_type\": \"l2\", \"name\": \"hnsw\", \"parameters\": {}},\n",
"            \"type\": \"knn_vector\",\n",
"        }\n",
"\n",
"    properties = {\n",
"        \"text\": text_field(),\n",
"        \"embedding\": knn_field(),\n",
"        \"title\": text_field(),\n",
"        \"searchable_text\": text_field(),\n",
"        \"title_embedding\": knn_field(),\n",
"        \"url\": text_field(),\n",
"    }\n",
"    index_options = {\"index.knn\": True, \"number_of_shards\": 5, \"number_of_replicas\": 1}\n",
"    return {\"body\": {\"settings\": index_options, \"mappings\": {\"properties\": properties}}}"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "95ab0a44-9b94-45f1-a892-d4ffa50931f6",
"metadata": {},
"outputs": [],
"source": [
"def get_title_context_template():\n",
"    \"\"\"Return the few-shot prompt text containing title-extraction examples.\n",
"\n",
"    The text holds two worked examples: numbered ELEMENT lines, a separator row,\n",
"    then the quoted title expected for those elements.\n",
"    \"\"\"\n",
"    # ruff: noqa: E501\n",
"    title_examples = \"\"\"\n",
" ELEMENT 1: Jupiter's Moons\n",
" ELEMENT 2: Ganymede 2020\n",
" ELEMENT 3: by Audi Lauper and Serena K. Goldberg. 2011\n",
" ELEMENT 4: From Wikipedia, the free encyclopedia\n",
" ELEMENT 5: Ganymede, or Jupiter III, is the largest and most massive natural satellite of Jupiter as well as in the Solar System, being a planetary-mass moon. It is the largest Solar System object without an atmosphere, despite being the only moon of the Solar System with a magnetic field. Like Titan, it is larger than the planet Mercury, but has somewhat less surface gravity than Mercury, Io or the Moon.\n",
" =========\n",
" \"Ganymede 2020\"\n",
"\n",
" ELEMENT 1: FLAVR: Flow-Agnostic Video Representations for Fast Frame Interpolation\n",
" ELEMENT 2: Tarun Kalluri * UCSD\n",
" ELEMENT 3: Deepak Pathak CMU\n",
" ELEMENT 4: Manmohan Chandraker UCSD\n",
" ELEMENT 5: Du Tran Facebook AI\n",
" ELEMENT 6: https://tarun005.github.io/FLAVR/\n",
" ELEMENT 7: 2 2 0 2\n",
" ELEMENT 8: b e F 4 2\n",
" ELEMENT 9: ]\n",
" ELEMENT 10: V C . s c [\n",
" ========\n",
" \"FLAVR: Flow-Agnostic Video Representations for Fast Frame Interpolation\"\n",
"\n",
" \"\"\"\n",
"    return title_examples\n",
"\n",
"def get_author_context_template():\n",
"    \"\"\"Return the few-shot prompt text containing author-extraction examples.\n",
"\n",
"    The text holds two worked examples: numbered ELEMENT lines, a separator row,\n",
"    then the comma-separated author names expected for those elements. Each\n",
"    expected answer must spell the names exactly as they appear in the ELEMENT\n",
"    lines, since the examples show the model what to copy.\n",
"    \"\"\"\n",
"    # ruff: noqa: E501\n",
"    return \"\"\"\n",
" ELEMENT 1: Jupiter's Moons\n",
" ELEMENT 2: Ganymede 2020\n",
" ELEMENT 3: by Audi Lauper and Serena K. Goldberg. 2011\n",
" ELEMENT 4: From Wikipedia, the free encyclopedia\n",
" ELEMENT 5: Ganymede, or Jupiter III, is the largest and most massive natural satellite of Jupiter as well as in the Solar System, being a planetary-mass moon. It is the largest Solar System object without an atmosphere, despite being the only moon of the Solar System with a magnetic field. Like Titan, it is larger than the planet Mercury, but has somewhat less surface gravity than Mercury, Io or the Moon.\n",
" =========\n",
" Audi Lauper, Serena K. Goldberg\n",
"\n",
" ELEMENT 1: FLAVR: Flow-Agnostic Video Representations for Fast Frame Interpolation\n",
" ELEMENT 2: Tarun Kalluri * UCSD\n",
" ELEMENT 3: Deepak Pathak CMU\n",
" ELEMENT 4: Manmohan Chandraker UCSD\n",
" ELEMENT 5: Du Tran Facebook AI\n",
" ELEMENT 6: https://tarun005.github.io/FLAVR/\n",
" ELEMENT 7: 2 2 0 2\n",
" ELEMENT 8: b e F 4 2\n",
" ELEMENT 9: ]\n",
" ELEMENT 10: V C . s c [\n",
" ========\n",
" Tarun Kalluri, Deepak Pathak, Manmohan Chandraker, Du Tran\n",
"\n",
" \"\"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "26836da3-4b96-412f-83e3-92bbe73ad5e3",
"metadata": {},
"outputs": [],
"source": [
"def import_pdf(paths):\n",
"    \"\"\"Run the PDF ingest pipeline and write the results to OpenSearch.\n",
"\n",
"    Steps, in order: read binaries as PDF, partition with\n",
"    ``UnstructuredPdfPartitioner``, merge with ``GreedyTextElementMerger``,\n",
"    extract ``title`` and ``authors`` through the OpenAI LLM, spread the\n",
"    ``path`` and ``title`` properties, explode, embed, and write to the\n",
"    OpenSearch index named by the module-level ``index``.\n",
"\n",
"    Args:\n",
"        paths: Passed straight to ``ctx.read.binary``. An empty or ``None``\n",
"            value prints a warning and returns without running the pipeline.\n",
"    \"\"\"\n",
"    # `not paths` also covers None, which len() would reject with a TypeError.\n",
"    if not paths:\n",
"        print(\"WARNING: import_pdf called with empty paths\")\n",
"        return\n",
"\n",
"    # The tokenizer and the embedder use the same model name, so spell it once.\n",
"    model_name = \"sentence-transformers/all-MiniLM-L6-v2\"\n",
"    openai_llm = OpenAI(OpenAIModels.GPT_3_5_TURBO_INSTRUCT.value)\n",
"    tokenizer = HuggingFaceTokenizer(model_name)\n",
"    merger = GreedyTextElementMerger(tokenizer, 256)\n",
"\n",
"    ctx = sycamore.init()\n",
"    (\n",
"        ctx.read.binary(paths, binary_format=\"pdf\", filter_paths_by_extension=False)\n",
"        .partition(\n",
"            partitioner=UnstructuredPdfPartitioner(),\n",
"        )\n",
"        .merge(merger)\n",
"        .extract_entity(\n",
"            entity_extractor=OpenAIEntityExtractor(\n",
"                \"title\", llm=openai_llm, prompt_template=get_title_context_template()\n",
"            )\n",
"        )\n",
"        .extract_entity(\n",
"            entity_extractor=OpenAIEntityExtractor(\n",
"                \"authors\", llm=openai_llm, prompt_template=get_author_context_template()\n",
"            )\n",
"        )\n",
"        .spread_properties([\"path\", \"title\"])\n",
"        .explode()\n",
"        .embed(\n",
"            embedder=SentenceTransformerEmbedder(batch_size=100, model_name=model_name)\n",
"        )\n",
"        .write.opensearch(os_client_args=get_os_client_args(), index_name=index, index_settings=get_index_settings())\n",
"    )"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "71fff973-09a7-46fe-b8d1-cc82d46e1ec0",
"metadata": {},
"outputs": [],
"source": [
"def import_html(paths):\n",
"    \"\"\"Run the HTML ingest pipeline and write the results to OpenSearch.\n",
"\n",
"    Steps, in order: read binaries as HTML, partition with ``HtmlPartitioner``\n",
"    (table extraction on, ``TextOverlapChunker`` with a chunk token count of\n",
"    4000 and an overlap token count of 400), spread the ``path`` and ``title``\n",
"    properties, explode, embed, and write to the OpenSearch index named by the\n",
"    module-level ``index``.\n",
"\n",
"    Args:\n",
"        paths: Passed straight to ``ctx.read.binary``. An empty or ``None``\n",
"            value prints a warning and returns without running the pipeline.\n",
"    \"\"\"\n",
"    # `not paths` also covers None, which len() would reject with a TypeError.\n",
"    if not paths:\n",
"        print(\"WARNING: import_html called with empty paths\")\n",
"        return\n",
"\n",
"    ctx = sycamore.init()\n",
"    (\n",
"        ctx.read.binary(paths, binary_format=\"html\", filter_paths_by_extension=False)\n",
"        .partition(\n",
"            partitioner=HtmlPartitioner(\n",
"                extract_tables=True,\n",
"                text_chunker=TextOverlapChunker(chunk_token_count=4000, chunk_overlap_token_count=400),\n",
"            )\n",
"        )\n",
"        .spread_properties([\"path\", \"title\"])\n",
"        .explode()\n",
"        .embed(\n",
"            embedder=SentenceTransformerEmbedder(batch_size=100, model_name=\"sentence-transformers/all-MiniLM-L6-v2\")\n",
"        )\n",
"        .write.opensearch(os_client_args=get_os_client_args(), index_name=index, index_settings=get_index_settings())\n",
"    )"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a259fa57-88ce-4260-96b4-11c55efedc25",
"metadata": {},
"outputs": [],
"source": [
"# Ingest the two download folders: PDFs first, then HTML.\n",
"pdf_dir = \"/app/work/crawl_data/downloads/pdf\"\n",
"html_dir = \"/app/work/crawl_data/downloads/html\"\n",
"import_pdf(pdf_dir)\n",
"import_html(html_dir)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "73a42faf-62f7-4816-ae2e-6844c1224730",
"metadata": {},
"outputs": [],
"source": [
"# An f-string avoids the doubled space that print()'s separator added before \" index\".\n",
"print(f\"Visit http://localhost:3000 and use the {index} index to query these results in the UI\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2f2aed0b-0b96-4697-98b9-ff0131328540",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

0 comments on commit 830f202

Please sign in to comment.