From 3a5541d83c1ec20ce3851ec079fadee067794c13 Mon Sep 17 00:00:00 2001
From: Kartik Choudhary
Date: Tue, 17 Oct 2023 14:00:14 -0400
Subject: [PATCH] Improvements in HuggingFace QA example notebook (#2391)

* Added info about required packages

* Update responsibleaidashboard-question-answering-model-debugging.ipynb

* show example prediction

* Update responsibleaidashboard-question-answering-model-debugging.ipynb
---
 ...d-question-answering-model-debugging.ipynb | 85 +++++++++++++------
 1 file changed, 58 insertions(+), 27 deletions(-)

diff --git a/notebooks/responsibleaidashboard/text/responsibleaidashboard-question-answering-model-debugging.ipynb b/notebooks/responsibleaidashboard/text/responsibleaidashboard-question-answering-model-debugging.ipynb
index 3b663cfc61..dbcb6b8dc9 100644
--- a/notebooks/responsibleaidashboard/text/responsibleaidashboard-question-answering-model-debugging.ipynb
+++ b/notebooks/responsibleaidashboard/text/responsibleaidashboard-question-answering-model-debugging.ipynb
@@ -42,6 +42,31 @@
     "The following section examines the code necessary to create datasets and a model. It then generates insights using the `responsibleai` API that can be visually analyzed."
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "6174bcad",
+   "metadata": {},
+   "source": [
+    "### Prepare\n",
+    "\n",
+    "To run this notebook, we need to install the following packages:\n",
+    "\n",
+    "```\n",
+    "raiutils\n",
+    "raiwidgets\n",
+    "datasets\n",
+    "transformers\n",
+    "responsibleai_text\n",
+    "torch\n",
+    "```\n",
+    "\n",
+    "Run the following command to load the spacy pipeline:\n",
+    "\n",
+    "```bash\n",
+    "python -m spacy download en_core_web_sm\n",
+    "```"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "40739025",
@@ -86,16 +111,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "dataset = datasets.load_dataset(\"squad\", split=\"train\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a0eef443",
-   "metadata": {},
-   "outputs": [],
-   "source": [
+    "dataset = datasets.load_dataset(\"squad\", split=\"train\")\n",
     "dataset"
    ]
   },
@@ -130,17 +146,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "data = pd.DataFrame({'context': context, 'questions': questions, 'answers': answers})"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "e6f87e9c",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "data"
+    "data = pd.DataFrame({'context': context, 'questions': questions, 'answers': answers})\n",
+    "data = data.sample(frac=1.0, random_state=42).reset_index(drop=True)\n",
+    "data.head()"
    ]
   },
@@ -159,18 +167,42 @@
    "outputs": [],
    "source": [
     "# load the question-answering model\n",
-    "pmodel = pipeline('question-answering')"
+    "pipeline_model = pipeline('question-answering')\n",
+    "test_size = 5\n",
+    "\n",
+    "train_data = data\n",
+    "test_data = data[:test_size]"
    ]
   },
   {
+   "cell_type": "markdown",
+   "id": "7cf8327b",
+   "metadata": {},
+   "source": [
+    "See an example of the model's predictions"
+   ]
+  },
+  {
    "cell_type": "code",
    "execution_count": null,
-   "id": "04801887",
+   "id": "ce087699",
    "metadata": {},
    "outputs": [],
    "source": [
-    "train_data = data\n",
-    "test_data = data[:5]"
+    "def get_answer(dataset, idx):\n",
+    "    model_output = pipeline_model(question=dataset['questions'][idx], \n",
+    "                                  context=dataset['context'][idx])\n",
+    "    pred = model_output['answer']\n",
+    "    return pred\n",
+    "\n",
+    "def check_answer(dataset, idx):\n",
+    "    pred = get_answer(dataset, idx)\n",
+    "    print('Question : ', dataset['questions'][idx])\n",
+    "    print('Answer : ', dataset['answers'][idx])\n",
+    "    print('Predicted : ', pred)\n",
+    "    print('Correct : ', pred == dataset['answers'][idx])\n",
+    "\n",
+    "check_answer(test_data, 0)\n"
    ]
   },
   {
@@ -209,8 +241,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "rai_insights = RAITextInsights(pmodel, test_data,\n",
-    "                               \"answers\",\n",
+    "rai_insights = RAITextInsights(pipeline_model, test_data, \"answers\",\n",
     "                               task_type=ModelTask.QUESTION_ANSWERING)"
    ]
   },