From c6a0824e32d74cb00b1019e51ac1f64f9d8695b6 Mon Sep 17 00:00:00 2001 From: Eric Pugh Date: Sat, 17 Feb 2024 11:35:19 -0500 Subject: [PATCH] Still need some work, but at least got them in --- README.md | 2 +- jupyterlite/files/README.md | 3 + jupyterlite/files/examples/Fleiss Kappa.ipynb | 160 +++++ .../examples/Multiple Raters Analysis.ipynb | 588 ++++++++++++++++++ 4 files changed, 752 insertions(+), 1 deletion(-) create mode 100644 jupyterlite/files/examples/Fleiss Kappa.ipynb create mode 100644 jupyterlite/files/examples/Multiple Raters Analysis.ipynb diff --git a/README.md b/README.md index 778c9cb..3964c5d 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ Browse to http://localhost:8000 and you should see the Jupyterlite interface. ## Development 2 -1. Run the docker task, and make the jupyter-lite-build.tgz. +1. Run `docker run -it --rm -e TARGET_DIR=/dist -v "$(pwd)":/dist $(docker build -q .)` producing the jupyter-lite-build.tgz. 1. Unzip it into the ./notebooks 1. `rm -rf public/notebooks` in Quepid 1. Make sure Quepid's docker-compose.override.yml has a line similar to `- /Users/epugh/Documents/projects/quepid-jupyterlite/notebooks:/srv/app/public/notebooks` diff --git a/jupyterlite/files/README.md b/jupyterlite/files/README.md index cbc3a77..f01c1b7 100644 --- a/jupyterlite/files/README.md +++ b/jupyterlite/files/README.md @@ -7,6 +7,9 @@ The example notebooks are stored under ./examples. Feel free to run them, but p * `./examples/Scoring Comparison.ipynb` is an example of measuring relevance score change. * `./examples/Jaccard and RBO Comparison.ipynb` is an example of comparing query result sets to each other. +* `./examples/Multiple Raters Analysis.ipynb` looks at how judges compare in their ratings. +* `./examples/Fleiss Kappa.ipynb` calculates a specific measurement of rater agreement. + These notebooks use data from the Haystack Rating Party.
diff --git a/jupyterlite/files/examples/Fleiss Kappa.ipynb b/jupyterlite/files/examples/Fleiss Kappa.ipynb new file mode 100644 index 0000000..e9df73b --- /dev/null +++ b/jupyterlite/files/examples/Fleiss Kappa.ipynb @@ -0,0 +1,160 @@ +{ + "metadata": { + "kernelspec": { + "name": "python", + "display_name": "Python (Pyodide)", + "language": "python" + }, + "language_info": { + "codemirror_mode": { + "name": "python", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8" + } + }, + "nbformat_minor": 5, + "nbformat": 4, + "cells": [ + { + "cell_type": "markdown", + "source": "# Fleiss' Kappa \nFleiss' Kappa measures how much your raters agree with each other, beyond the agreement you would expect by chance.\n\nPlease copy this example and customize it for your own purposes!", + "metadata": {}, + "id": "bd7e4efa-eb00-451e-984d-ed6646d8e25f" + }, + { + "cell_type": "markdown", + "source": "## Imports", + "metadata": {}, + "id": "e3412382" + }, + { + "cell_type": "code", + "source": "import pandas as pd\nfrom js import fetch\nimport json\n\nfrom collections import defaultdict\nfrom statsmodels.stats.inter_rater import aggregate_raters\nfrom statsmodels.stats.inter_rater import fleiss_kappa\nfrom IPython.display import display, Markdown", + "metadata": { + "trusted": true + }, + "execution_count": 1, + "outputs": [], + "id": "4972936a" + }, + { + "cell_type": "markdown", + "source": "## Step 0: Configuration", + "metadata": {}, + "id": "6da26c5e" + }, + { + "cell_type": "code", + "source": "QUEPID_BOOK_NUM = 25\n\n# Not needed if running within Quepid JupyterLite\n# QUEPID_API_TOKEN = \"\"", + "metadata": { + "trusted": true + }, + "execution_count": 3, + "outputs": [], + "id": "71803a49-4065-4adf-a69e-cb0fe2d00f22" + }, + { + "cell_type": "markdown", + "source": "## Step 1: Download the Quepid Book", + "metadata": {}, + "id": "420416df-9e6a-41b4-987b-7a03c9dd38b3" + }, + { + "cell_type":
"code", + "source": "# Generic GET call to a JSON endpoint \nasync def get_json(url):\n resp = await fetch(url)\n resp_text = await resp.text()\n return json.loads(resp_text)\n\n", + "metadata": { + "trusted": true + }, + "execution_count": 4, + "outputs": [], + "id": "31193536-98eb-4b46-ab98-af04ee07c6d3" + }, + { + "cell_type": "code", + "source": "data = await get_json(f'/api/export/books/{QUEPID_BOOK_NUM}')", + "metadata": { + "trusted": true + }, + "execution_count": 5, + "outputs": [], + "id": "8fef6231-daa8-467f-ac57-13a144e8a356" + }, + { + "cell_type": "markdown", + "source": "## Step 2: Extract and Prepare Data", + "metadata": {}, + "id": "79d985ad-cd11-44a9-a7e1-0851bc99aef3" + }, + { + "cell_type": "code", + "source": "# Initialize a list to hold the tuples of (doc_id, rating, count)\nratings_data = []\n\n# Iterate through each query-doc pair\nfor pair in data['query_doc_pairs']:\n # Initialize a dictionary to count the ratings for this pair\n ratings_count = defaultdict(int)\n \n # Extract judgements and count the ratings\n for judgement in pair['judgements']:\n rating = judgement['rating']\n ratings_count[rating] += 1\n\n # Append the counts to the ratings_data list\n for rating, count in ratings_count.items():\n ratings_data.append((pair['doc_id'], rating, count))\n", + "metadata": { + "trusted": true + }, + "execution_count": 6, + "outputs": [], + "id": "9a8561fd-2dbf-477e-9ac1-4df6d5ebdc91" + }, + { + "cell_type": "markdown", + "source": "## Step 3: Aggregate Raters' Data", + "metadata": {}, + "id": "caf5632b-132a-4e1b-80fe-c8c5ab7f2f3a" + }, + { + "cell_type": "code", + "source": "# Convert ratings_data to a DataFrame\ndf = pd.DataFrame(ratings_data, columns=['doc_id', 'rating', 'count'])\n\n# Use crosstab to create a contingency table\ndata_crosstab = pd.crosstab(index=df['doc_id'], columns=df['rating'], values=df['count'], aggfunc='sum')\n\n# Drop any rows missing judgements\ndata_crosstab = data_crosstab.dropna(how='any')\n\n# Convert the 
DataFrame to the format expected by aggregate_raters\ndata_for_aggregation = data_crosstab.values\n\n# Aggregate the raters' data\ntable, _ = aggregate_raters(data_for_aggregation)", + "metadata": { + "trusted": true + }, + "execution_count": 7, + "outputs": [], + "id": "a7598308-129b-4628-ad3a-fc3d703f8205" + }, + { + "cell_type": "markdown", + "source": "## Step 4: Compute Fleiss' Kappa", + "metadata": {}, + "id": "25c79fbc" + }, + { + "cell_type": "code", + "source": "kappa = fleiss_kappa(table, method='fleiss')\ndisplay(Markdown(f\"## Fleiss' Kappa: {kappa:.4f}\"))", + "metadata": { + "trusted": true + }, + "execution_count": 8, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": "", + "text/markdown": "## Fleiss' Kappa: -0.3333" + }, + "metadata": {} + } + ], + "id": "25a613f9" + }, + { + "cell_type": "markdown", + "source": "_This notebook was last updated 17-FEB-2024_", + "metadata": {}, + "id": "5704579e-2321-4629-8de0-6608b428e2b6" + }, + { + "cell_type": "code", + "source": "", + "metadata": {}, + "execution_count": null, + "outputs": [], + "id": "7203f6cc-c068-4f75-a59a-1f49c5555319" + } + ] +} \ No newline at end of file diff --git a/jupyterlite/files/examples/Multiple Raters Analysis.ipynb b/jupyterlite/files/examples/Multiple Raters Analysis.ipynb new file mode 100644 index 0000000..1a262b4 --- /dev/null +++ b/jupyterlite/files/examples/Multiple Raters Analysis.ipynb @@ -0,0 +1,588 @@ +{ + "metadata": { + "kernelspec": { + "name": "python", + "display_name": "Python (Pyodide)", + "language": "python" + }, + "language_info": { + "codemirror_mode": { + "name": "python", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8" + } + }, + "nbformat_minor": 5, + "nbformat": 4, + "cells": [ + { + "cell_type": "markdown", + "source": "# Analysis of Judgements with Multiple Raters\n\nThis notebook is an 
example of how we can analyze query / document pair judgements in the case of multiple judges.\n\n**Why?**\nWhen we have one single rater for our judgements, we have no choice but to trust the rating of that judge. However, when we have multiple judges rating the same pairs, this gives us much more information about the actual rating of the pair. For example if all judges disagree, it might be a good indicator that the pair is hard to judge, for example because the intent is not 100% clear. Conversely, if all judges agree, we can have much higher confidence in this rating.\n\n**Goal**\nAnalyze agreement between raters to measure confidence about each rating, in particular we would like to:\n\n - analyze the distribution of grades\n - measure raters' consistency\n - identify suspicious ratings in order to clean the dataset and make sure we only keep pairs for which we have enough confidence in the rating.\n\n\n**Data**\nFor this we need a dataset with the following information:\n \n - pairs \n - Each pair is rated by 3 different judges. In this example we consider a rating grade between 0 and 3:\n - `0`: document relevance is **poor**\n - `1`: document relevance is **fair**\n - `2`: document relevance is **good**\n - `3`: document relevance is **perfect**\n \nThe analysis would also work with other grade scales with minor changes in the code.\n\nTo get this data, we will get a judgement book directly from Quepid, via the API.
Any external data could also be used.\n\n \n", + "metadata": {}, + "id": "5aa3a7af" + }, + { + "cell_type": "markdown", + "source": "## Setup\nBasic libraries needed in this analysis", + "metadata": {}, + "id": "77cc28f8" + }, + { + "cell_type": "code", + "source": "import pandas as pd\nimport numpy as np\nimport matplotlib.pylab as plt\nimport random\nimport string\nfrom js import fetch\nfrom datetime import datetime\nimport random\nfrom matplotlib import pyplot\nfrom io import StringIO\n%matplotlib inline", + "metadata": { + "trusted": true + }, + "execution_count": 1, + "outputs": [], + "id": "ce97f3d0" + }, + { + "cell_type": "code", + "source": "ratings = ['0-Bad', '1-Fair', '2-Good', '3-Perfect']", + "metadata": { + "trusted": true + }, + "execution_count": 2, + "outputs": [], + "id": "d985176c-95c1-4d29-bc03-06f03cf77ed4" + }, + { + "cell_type": "markdown", + "source": "## Get Ratings From Books API", + "metadata": {}, + "id": "b57a896c-416b-4992-980d-2870de7cc120" + }, + { + "cell_type": "code", + "source": "# You need to get your book_id from Quepid UI. You should be able to see its content if you open /api/books/1.json\nBOOK_ID = 25", + "metadata": { + "trusted": true + }, + "execution_count": 3, + "outputs": [], + "id": "ed1f99f3-ff72-4e9d-81cd-3afef204cbef" + }, + { + "cell_type": "code", + "source": "# Get content of the book in CSV format (could also use JSON)\nres = await fetch(f'/api/books/{BOOK_ID}.csv')\nres_str = await res.text()\ndf = pd.read_csv(StringIO(res_str))\ndf", + "metadata": { + "trusted": true + }, + "execution_count": 4, + "outputs": [ + { + "execution_count": 4, + "output_type": "execute_result", + "data": { + "text/plain": " query docid charlie@flax.co.uk \\\n0 projector screen 325961 NaN \n1 projector screen 47471 NaN \n2 projector screen 126679 NaN \n3 projector screen 254441 NaN \n4 projector screen 325958 NaN \n... ... ... ... 
\n2415 power supply 1667352 NaN \n2416 power supply 1667804 NaN \n2417 power supply 1667752 NaN \n2418 power supply 1667821 NaN \n2419 power supply 1667357 NaN \n\n epugh@opensourceconnections.com eschramma@cas.org dtaivpp@gmail.com \\\n0 3.0 NaN 3.0 \n1 3.0 NaN 3.0 \n2 3.0 NaN 3.0 \n3 3.0 NaN NaN \n4 3.0 NaN NaN \n... ... ... ... \n2415 0.0 NaN NaN \n2416 0.0 NaN NaN \n2417 0.0 NaN NaN \n2418 0.0 NaN NaN \n2419 0.0 NaN NaN \n\n aarora@opensourceconnections.com cmcollier@gmail.com \\\n0 NaN NaN \n1 NaN NaN \n2 NaN NaN \n3 NaN NaN \n4 NaN NaN \n... ... ... \n2415 NaN NaN \n2416 NaN NaN \n2417 NaN NaN \n2418 NaN NaN \n2419 NaN NaN \n\n ben.w.trent@gmail.com jeff@vin.com cmarino@enterprise-knowledge.com \\\n0 NaN NaN NaN \n1 NaN NaN NaN \n2 NaN NaN NaN \n3 NaN NaN NaN \n4 NaN NaN NaN \n... ... ... ... \n2415 NaN NaN NaN \n2416 NaN NaN NaN \n2417 NaN NaN NaN \n2418 NaN NaN NaN \n2419 NaN NaN NaN \n\n msfroh@gmail.com peter@searchintuition.com maximilian.werk@jina.ai \\\n0 NaN NaN NaN \n1 NaN NaN NaN \n2 NaN NaN NaN \n3 NaN NaN NaN \n4 NaN NaN NaN \n... ... ... ... \n2415 NaN NaN NaN \n2416 NaN NaN NaN \n2417 NaN NaN NaN \n2418 NaN NaN NaN \n2419 NaN NaN NaN \n\n ryan.finley@ferguson.com \n0 NaN \n1 NaN \n2 NaN \n3 NaN \n4 NaN \n... ... \n2415 NaN \n2416 NaN \n2417 NaN \n2418 NaN \n2419 NaN \n\n[2420 rows x 15 columns]", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
querydocidcharlie@flax.co.ukepugh@opensourceconnections.comeschramma@cas.orgdtaivpp@gmail.comaarora@opensourceconnections.comcmcollier@gmail.comben.w.trent@gmail.comjeff@vin.comcmarino@enterprise-knowledge.commsfroh@gmail.competer@searchintuition.commaximilian.werk@jina.airyan.finley@ferguson.com
0projector screen325961NaN3.0NaN3.0NaNNaNNaNNaNNaNNaNNaNNaNNaN
1projector screen47471NaN3.0NaN3.0NaNNaNNaNNaNNaNNaNNaNNaNNaN
2projector screen126679NaN3.0NaN3.0NaNNaNNaNNaNNaNNaNNaNNaNNaN
3projector screen254441NaN3.0NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4projector screen325958NaN3.0NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
................................................
2415power supply1667352NaN0.0NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
2416power supply1667804NaN0.0NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
2417power supply1667752NaN0.0NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
2418power supply1667821NaN0.0NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
2419power supply1667357NaN0.0NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n

2420 rows × 15 columns

\n
" + }, + "metadata": {} + } + ], + "id": "30ce3dc2-c049-436c-8049-58c0d8f4e46f" + }, + { + "cell_type": "code", + "source": "if df.shape[0] == 0:\n print('It Looks like your book is empty or does not exists')", + "metadata": { + "trusted": true + }, + "execution_count": 5, + "outputs": [], + "id": "98ad3844-d67f-44a2-8bae-034223de6c68" + }, + { + "cell_type": "code", + "source": "df.dropna(inplace=True)\ndf.shape", + "metadata": { + "trusted": true + }, + "execution_count": 6, + "outputs": [ + { + "execution_count": 6, + "output_type": "execute_result", + "data": { + "text/plain": "(0, 15)" + }, + "metadata": {} + } + ], + "id": "79ce92af-e034-4e7d-bc8d-3a96c0fcd14f" + }, + { + "cell_type": "code", + "source": "df.loc[df['docid'] == '325961']", + "metadata": { + "trusted": true + }, + "execution_count": 7, + "outputs": [ + { + "execution_count": 7, + "output_type": "execute_result", + "data": { + "text/plain": "Empty DataFrame\nColumns: [query, docid, charlie@flax.co.uk, epugh@opensourceconnections.com, eschramma@cas.org, dtaivpp@gmail.com, aarora@opensourceconnections.com, cmcollier@gmail.com, ben.w.trent@gmail.com, jeff@vin.com, cmarino@enterprise-knowledge.com, msfroh@gmail.com, peter@searchintuition.com, maximilian.werk@jina.ai, ryan.finley@ferguson.com]\nIndex: []", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
querydocidcharlie@flax.co.ukepugh@opensourceconnections.comeschramma@cas.orgdtaivpp@gmail.comaarora@opensourceconnections.comcmcollier@gmail.comben.w.trent@gmail.comjeff@vin.comcmarino@enterprise-knowledge.commsfroh@gmail.competer@searchintuition.commaximilian.werk@jina.airyan.finley@ferguson.com
\n
" + }, + "metadata": {} + } + ], + "id": "25006e0e-2013-4384-a34f-8fa920579614" + }, + { + "cell_type": "code", + "source": "raters = list(df.columns[2:])\nraters", + "metadata": { + "trusted": true + }, + "execution_count": 8, + "outputs": [ + { + "execution_count": 8, + "output_type": "execute_result", + "data": { + "text/plain": "['charlie@flax.co.uk',\n 'epugh@opensourceconnections.com',\n 'eschramma@cas.org',\n 'dtaivpp@gmail.com',\n 'aarora@opensourceconnections.com',\n 'cmcollier@gmail.com',\n 'ben.w.trent@gmail.com',\n 'jeff@vin.com',\n 'cmarino@enterprise-knowledge.com',\n 'msfroh@gmail.com',\n 'peter@searchintuition.com',\n 'maximilian.werk@jina.ai',\n 'ryan.finley@ferguson.com']" + }, + "metadata": {} + } + ], + "id": "72d4481e-ae12-4fff-bbbd-1888a894f69a" + }, + { + "cell_type": "code", + "source": "nb_raters = len(raters)", + "metadata": { + "trusted": true + }, + "execution_count": 9, + "outputs": [], + "id": "4c1fc91f-cda6-4e76-8372-3062e6975adb" + }, + { + "cell_type": "markdown", + "source": "We just transform a bit the data so that it's easier to process:", + "metadata": {}, + "id": "d704b517-aea3-4177-bb49-88f8d57ce647" + }, + { + "cell_type": "code", + "source": "df.rename(columns={rn:f'rating_{i}' for i,rn in enumerate(raters)}, inplace=True)\nfor i, rn in enumerate(raters):\n df[f'rater_{i}'] = rn\ndf", + "metadata": { + "trusted": true + }, + "execution_count": 10, + "outputs": [ + { + "execution_count": 10, + "output_type": "execute_result", + "data": { + "text/plain": "Empty DataFrame\nColumns: [query, docid, rating_0, rating_1, rating_2, rating_3, rating_4, rating_5, rating_6, rating_7, rating_8, rating_9, rating_10, rating_11, rating_12, rater_0, rater_1, rater_2, rater_3, rater_4, rater_5, rater_6, rater_7, rater_8, rater_9, rater_10, rater_11, rater_12]\nIndex: []\n\n[0 rows x 28 columns]", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
querydocidrating_0rating_1rating_2rating_3rating_4rating_5rating_6rating_7...rater_3rater_4rater_5rater_6rater_7rater_8rater_9rater_10rater_11rater_12
\n

0 rows × 28 columns

\n
" + }, + "metadata": {} + } + ], + "id": "e81fbcd6-f2aa-476b-9b81-2ae7a950fe99" + }, + { + "cell_type": "markdown", + "source": "We flatten the data to have 1 rating per row:", + "metadata": {}, + "id": "c0396b4d-eb54-4786-939e-b8ad69d335ab" + }, + { + "cell_type": "code", + "source": "df_overall = pd.concat([\n df[['query','docid',f'rating_{i}', f'rater_{i}']].rename(\n columns={f'rating_{i}':'rating', f'rater_{i}':'rater'}) for i in range(nb_raters)]).reset_index(drop=True)\ndf_overall.sort_values(by=['docid'], inplace=True)\ndf_overall", + "metadata": { + "trusted": true + }, + "execution_count": 11, + "outputs": [ + { + "execution_count": 11, + "output_type": "execute_result", + "data": { + "text/plain": "Empty DataFrame\nColumns: [query, docid, rating, rater]\nIndex: []", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n
querydocidratingrater
\n
" + }, + "metadata": {} + } + ], + "id": "97b0bc0c-a20e-49b2-a65a-ef09eb7e6a58" + }, + { + "cell_type": "markdown", + "source": "### Rating distribution per query\nHe we just want to plot the distribution of ratings for each query:\n", + "metadata": {}, + "id": "ff2dcb29-e376-4621-9ecf-f3374e71f464" + }, + { + "cell_type": "code", + "source": "df_overall[['query', 'rating']].groupby('query').agg(['count', 'mean', 'std'])", + "metadata": { + "trusted": true + }, + "execution_count": 12, + "outputs": [ + { + "execution_count": 12, + "output_type": "execute_result", + "data": { + "text/plain": "Empty DataFrame\nColumns: [(rating, count), (rating, mean), (rating, std)]\nIndex: []", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
rating
countmeanstd
query
\n
" + }, + "metadata": {} + } + ], + "id": "50d46a70" + }, + { + "cell_type": "code", + "source": "fig, axes = plt.subplots()\nqueries = df_overall['query'].unique()\ndataset = [df_overall[df_overall['query'] == q][\"rating\"] for q in queries]\n\nnb_queries = len(queries)\n\naxes.violinplot(dataset = dataset, showmeans=True, bw_method=0.05)\naxes.set_xlabel('query')\naxes.set_ylabel('ratings')\naxes.yaxis.grid(True)\naxes.set_xticks(range(1,nb_queries+1))\naxes.set_xticklabels(queries,rotation=45)\nplt.title('Rating distribution per query')\nplt.show()", + "metadata": { + "trusted": true + }, + "execution_count": 13, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": "
", + "image/png": "\n" + }, + "metadata": {} + }, + { + "ename": "", + "evalue": "zero-size array to reduction operation minimum which has no identity", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[13], line 7\u001b[0m\n\u001b[1;32m 3\u001b[0m dataset \u001b[38;5;241m=\u001b[39m [df_overall[df_overall[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mquery\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m==\u001b[39m q][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrating\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;28;01mfor\u001b[39;00m q \u001b[38;5;129;01min\u001b[39;00m queries]\n\u001b[1;32m 5\u001b[0m nb_queries \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(queries)\n\u001b[0;32m----> 7\u001b[0m \u001b[43maxes\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mviolinplot\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdataset\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mdataset\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mshowmeans\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbw_method\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.05\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 8\u001b[0m axes\u001b[38;5;241m.\u001b[39mset_xlabel(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mquery\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 9\u001b[0m axes\u001b[38;5;241m.\u001b[39mset_ylabel(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mratings\u001b[39m\u001b[38;5;124m'\u001b[39m)\n", + "File \u001b[0;32m/lib/python3.11/site-packages/matplotlib/__init__.py:1412\u001b[0m, in \u001b[0;36m_preprocess_data..inner\u001b[0;34m(ax, data, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1409\u001b[0m 
\u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(func)\n\u001b[1;32m 1410\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21minner\u001b[39m(ax, \u001b[38;5;241m*\u001b[39margs, data\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 1411\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m data \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m-> 1412\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43max\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43mmap\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43msanitize_sequence\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1414\u001b[0m bound \u001b[38;5;241m=\u001b[39m new_sig\u001b[38;5;241m.\u001b[39mbind(ax, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 1415\u001b[0m auto_label \u001b[38;5;241m=\u001b[39m (bound\u001b[38;5;241m.\u001b[39marguments\u001b[38;5;241m.\u001b[39mget(label_namer)\n\u001b[1;32m 1416\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m bound\u001b[38;5;241m.\u001b[39mkwargs\u001b[38;5;241m.\u001b[39mget(label_namer))\n", + "File \u001b[0;32m/lib/python3.11/site-packages/matplotlib/axes/_axes.py:7938\u001b[0m, in \u001b[0;36mAxes.violinplot\u001b[0;34m(self, dataset, positions, vert, widths, showmeans, showextrema, showmedians, quantiles, points, bw_method)\u001b[0m\n\u001b[1;32m 7935\u001b[0m kde \u001b[38;5;241m=\u001b[39m mlab\u001b[38;5;241m.\u001b[39mGaussianKDE(X, bw_method)\n\u001b[1;32m 7936\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
kde\u001b[38;5;241m.\u001b[39mevaluate(coords)\n\u001b[0;32m-> 7938\u001b[0m vpstats \u001b[38;5;241m=\u001b[39m \u001b[43mcbook\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mviolin_stats\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdataset\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m_kde_method\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpoints\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpoints\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 7939\u001b[0m \u001b[43m \u001b[49m\u001b[43mquantiles\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mquantiles\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 7940\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mviolin(vpstats, positions\u001b[38;5;241m=\u001b[39mpositions, vert\u001b[38;5;241m=\u001b[39mvert,\n\u001b[1;32m 7941\u001b[0m widths\u001b[38;5;241m=\u001b[39mwidths, showmeans\u001b[38;5;241m=\u001b[39mshowmeans,\n\u001b[1;32m 7942\u001b[0m showextrema\u001b[38;5;241m=\u001b[39mshowextrema, showmedians\u001b[38;5;241m=\u001b[39mshowmedians)\n", + "File \u001b[0;32m/lib/python3.11/site-packages/matplotlib/cbook/__init__.py:1447\u001b[0m, in \u001b[0;36mviolin_stats\u001b[0;34m(X, method, points, quantiles)\u001b[0m\n\u001b[1;32m 1444\u001b[0m stats \u001b[38;5;241m=\u001b[39m {}\n\u001b[1;32m 1446\u001b[0m \u001b[38;5;66;03m# Calculate basic stats for the distribution\u001b[39;00m\n\u001b[0;32m-> 1447\u001b[0m min_val \u001b[38;5;241m=\u001b[39m \u001b[43mnp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmin\u001b[49m\u001b[43m(\u001b[49m\u001b[43mx\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1448\u001b[0m max_val \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mmax(x)\n\u001b[1;32m 1449\u001b[0m quantile_val \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mpercentile(x, \u001b[38;5;241m100\u001b[39m \u001b[38;5;241m*\u001b[39m q)\n", + "File \u001b[0;32m<__array_function__ internals>:200\u001b[0m, 
in \u001b[0;36mamin\u001b[0;34m(*args, **kwargs)\u001b[0m\n", + "File \u001b[0;32m/lib/python3.11/site-packages/numpy/core/fromnumeric.py:2946\u001b[0m, in \u001b[0;36mamin\u001b[0;34m(a, axis, out, keepdims, initial, where)\u001b[0m\n\u001b[1;32m 2829\u001b[0m \u001b[38;5;129m@array_function_dispatch\u001b[39m(_amin_dispatcher)\n\u001b[1;32m 2830\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mamin\u001b[39m(a, axis\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, out\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, keepdims\u001b[38;5;241m=\u001b[39mnp\u001b[38;5;241m.\u001b[39m_NoValue, initial\u001b[38;5;241m=\u001b[39mnp\u001b[38;5;241m.\u001b[39m_NoValue,\n\u001b[1;32m 2831\u001b[0m where\u001b[38;5;241m=\u001b[39mnp\u001b[38;5;241m.\u001b[39m_NoValue):\n\u001b[1;32m 2832\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 2833\u001b[0m \u001b[38;5;124;03m Return the minimum of an array or minimum along an axis.\u001b[39;00m\n\u001b[1;32m 2834\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 2944\u001b[0m \u001b[38;5;124;03m 6\u001b[39;00m\n\u001b[1;32m 2945\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m-> 2946\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_wrapreduction\u001b[49m\u001b[43m(\u001b[49m\u001b[43ma\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mminimum\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mmin\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2947\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mkeepdims\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mkeepdims\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minitial\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minitial\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mwhere\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mwhere\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/lib/python3.11/site-packages/numpy/core/fromnumeric.py:86\u001b[0m, in \u001b[0;36m_wrapreduction\u001b[0;34m(obj, ufunc, method, axis, dtype, out, **kwargs)\u001b[0m\n\u001b[1;32m 83\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 84\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m reduction(axis\u001b[38;5;241m=\u001b[39maxis, out\u001b[38;5;241m=\u001b[39mout, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mpasskwargs)\n\u001b[0;32m---> 86\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mufunc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mreduce\u001b[49m\u001b[43m(\u001b[49m\u001b[43mobj\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mout\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mpasskwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[0;31mValueError\u001b[0m: zero-size array to reduction operation minimum which has no identity" + ], + "output_type": "error" + } + ], + "id": "70b93e5d-425d-4925-97fd-1b062f7c373f" + }, + { + "cell_type": "markdown", + "source": "### Rating distribution per rater\n\nAssuming raters have seen the same pairs and that they had the same guidelines, their ratings should be overall calibrated. This is what we want to check here. 
\nWe could easily detect outlier raters that rate very differently from others, for example because they haven't really understood the guidelines.", + "metadata": {}, + "id": "fa14bfdd-ea38-4fcc-9532-4019306410b1" + }, + { + "cell_type": "code", + "source": "fig, axes = plt.subplots()\nraters = df_overall['rater'].unique()\ndataset = [df_overall[df_overall['rater'] == r][\"rating\"] for r in raters]\n\naxes.violinplot(dataset = dataset, showmeans=True, bw_method=0.05)\naxes.set_xlabel('rater')\naxes.set_ylabel('ratings')\naxes.yaxis.grid(True)\naxes.set_xticks(range(1,nb_raters+1))\naxes.set_xticklabels(raters)\nplt.title('Rating distribution per rater')\nplt.show()", + "metadata": { + "trusted": true + }, + "execution_count": 14, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": "
", + "image/png": "\n" + }, + "metadata": {} + }, + { + "ename": "", + "evalue": "zero-size array to reduction operation minimum which has no identity", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[14], line 5\u001b[0m\n\u001b[1;32m 2\u001b[0m raters \u001b[38;5;241m=\u001b[39m df_overall[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mrater\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39munique()\n\u001b[1;32m 3\u001b[0m dataset \u001b[38;5;241m=\u001b[39m [df_overall[df_overall[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mrater\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m==\u001b[39m r][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrating\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;28;01mfor\u001b[39;00m r \u001b[38;5;129;01min\u001b[39;00m raters]\n\u001b[0;32m----> 5\u001b[0m \u001b[43maxes\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mviolinplot\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdataset\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mdataset\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mshowmeans\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbw_method\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.05\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 6\u001b[0m axes\u001b[38;5;241m.\u001b[39mset_xlabel(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mrater\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 7\u001b[0m axes\u001b[38;5;241m.\u001b[39mset_ylabel(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mratings\u001b[39m\u001b[38;5;124m'\u001b[39m)\n", + "File \u001b[0;32m/lib/python3.11/site-packages/matplotlib/__init__.py:1412\u001b[0m, in \u001b[0;36m_preprocess_data..inner\u001b[0;34m(ax, data, *args, 
**kwargs)\u001b[0m\n\u001b[1;32m 1409\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(func)\n\u001b[1;32m 1410\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21minner\u001b[39m(ax, \u001b[38;5;241m*\u001b[39margs, data\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 1411\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m data \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m-> 1412\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43max\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43mmap\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43msanitize_sequence\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1414\u001b[0m bound \u001b[38;5;241m=\u001b[39m new_sig\u001b[38;5;241m.\u001b[39mbind(ax, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 1415\u001b[0m auto_label \u001b[38;5;241m=\u001b[39m (bound\u001b[38;5;241m.\u001b[39marguments\u001b[38;5;241m.\u001b[39mget(label_namer)\n\u001b[1;32m 1416\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m bound\u001b[38;5;241m.\u001b[39mkwargs\u001b[38;5;241m.\u001b[39mget(label_namer))\n", + "File \u001b[0;32m/lib/python3.11/site-packages/matplotlib/axes/_axes.py:7938\u001b[0m, in \u001b[0;36mAxes.violinplot\u001b[0;34m(self, dataset, positions, vert, widths, showmeans, showextrema, showmedians, quantiles, points, bw_method)\u001b[0m\n\u001b[1;32m 7935\u001b[0m kde \u001b[38;5;241m=\u001b[39m mlab\u001b[38;5;241m.\u001b[39mGaussianKDE(X, bw_method)\n\u001b[1;32m 7936\u001b[0m 
\u001b[38;5;28;01mreturn\u001b[39;00m kde\u001b[38;5;241m.\u001b[39mevaluate(coords)\n\u001b[0;32m-> 7938\u001b[0m vpstats \u001b[38;5;241m=\u001b[39m \u001b[43mcbook\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mviolin_stats\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdataset\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m_kde_method\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpoints\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpoints\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 7939\u001b[0m \u001b[43m \u001b[49m\u001b[43mquantiles\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mquantiles\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 7940\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mviolin(vpstats, positions\u001b[38;5;241m=\u001b[39mpositions, vert\u001b[38;5;241m=\u001b[39mvert,\n\u001b[1;32m 7941\u001b[0m widths\u001b[38;5;241m=\u001b[39mwidths, showmeans\u001b[38;5;241m=\u001b[39mshowmeans,\n\u001b[1;32m 7942\u001b[0m showextrema\u001b[38;5;241m=\u001b[39mshowextrema, showmedians\u001b[38;5;241m=\u001b[39mshowmedians)\n", + "File \u001b[0;32m/lib/python3.11/site-packages/matplotlib/cbook/__init__.py:1447\u001b[0m, in \u001b[0;36mviolin_stats\u001b[0;34m(X, method, points, quantiles)\u001b[0m\n\u001b[1;32m 1444\u001b[0m stats \u001b[38;5;241m=\u001b[39m {}\n\u001b[1;32m 1446\u001b[0m \u001b[38;5;66;03m# Calculate basic stats for the distribution\u001b[39;00m\n\u001b[0;32m-> 1447\u001b[0m min_val \u001b[38;5;241m=\u001b[39m \u001b[43mnp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmin\u001b[49m\u001b[43m(\u001b[49m\u001b[43mx\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1448\u001b[0m max_val \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mmax(x)\n\u001b[1;32m 1449\u001b[0m quantile_val \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mpercentile(x, \u001b[38;5;241m100\u001b[39m \u001b[38;5;241m*\u001b[39m q)\n", + "File 
\u001b[0;32m<__array_function__ internals>:200\u001b[0m, in \u001b[0;36mamin\u001b[0;34m(*args, **kwargs)\u001b[0m\n", + "File \u001b[0;32m/lib/python3.11/site-packages/numpy/core/fromnumeric.py:2946\u001b[0m, in \u001b[0;36mamin\u001b[0;34m(a, axis, out, keepdims, initial, where)\u001b[0m\n\u001b[1;32m 2829\u001b[0m \u001b[38;5;129m@array_function_dispatch\u001b[39m(_amin_dispatcher)\n\u001b[1;32m 2830\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mamin\u001b[39m(a, axis\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, out\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, keepdims\u001b[38;5;241m=\u001b[39mnp\u001b[38;5;241m.\u001b[39m_NoValue, initial\u001b[38;5;241m=\u001b[39mnp\u001b[38;5;241m.\u001b[39m_NoValue,\n\u001b[1;32m 2831\u001b[0m where\u001b[38;5;241m=\u001b[39mnp\u001b[38;5;241m.\u001b[39m_NoValue):\n\u001b[1;32m 2832\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 2833\u001b[0m \u001b[38;5;124;03m Return the minimum of an array or minimum along an axis.\u001b[39;00m\n\u001b[1;32m 2834\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 2944\u001b[0m \u001b[38;5;124;03m 6\u001b[39;00m\n\u001b[1;32m 2945\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m-> 2946\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_wrapreduction\u001b[49m\u001b[43m(\u001b[49m\u001b[43ma\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mminimum\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mmin\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2947\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mkeepdims\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mkeepdims\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minitial\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minitial\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mwhere\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mwhere\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/lib/python3.11/site-packages/numpy/core/fromnumeric.py:86\u001b[0m, in \u001b[0;36m_wrapreduction\u001b[0;34m(obj, ufunc, method, axis, dtype, out, **kwargs)\u001b[0m\n\u001b[1;32m 83\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 84\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m reduction(axis\u001b[38;5;241m=\u001b[39maxis, out\u001b[38;5;241m=\u001b[39mout, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mpasskwargs)\n\u001b[0;32m---> 86\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mufunc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mreduce\u001b[49m\u001b[43m(\u001b[49m\u001b[43mobj\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mout\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mpasskwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[0;31mValueError\u001b[0m: zero-size array to reduction operation minimum which has no identity" + ], + "output_type": "error" + } + ], + "id": "f954846d-54a9-4cf1-a9da-ab0faaa46df9" + }, + { + "cell_type": "markdown", + "source": "## Ratings Analysis\n\nWe now focus on the ratings themselves. 
We first plot the overall rating distributions", + "metadata": {}, + "id": "6168e768" + }, + { + "cell_type": "markdown", + "source": "### Overall rating distribution", + "metadata": {}, + "id": "92f38adf" + }, + { + "cell_type": "code", + "source": "from matplotlib.ticker import MaxNLocator\nplt.hist(df_overall['rating'])\nplt.title('Overall ratings')\nplt.xticks(range(len(ratings)), ratings,\n rotation=60) \nNone\nplt.ylabel('nb_ratings')\nplt.gca().yaxis.set_major_locator(MaxNLocator(integer=True))\n\nplt.grid()", + "metadata": { + "trusted": true + }, + "execution_count": 15, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": "
", + "image/png": "\n" + }, + "metadata": {} + } + ], + "id": "078955ba" + }, + { + "cell_type": "markdown", + "source": "### Agreements between raters\nIn this section we want to check how much raters agree or disagree.\n\n - If all raters agree on a rating, that gives us confidence in the rating\n - If all raters disagree, for example one rating `Poor`, another rating `Fair` and a third rating `Perfect`, then that should draw our attention. Either the intent was not clear or raters did not rate as expected. ", + "metadata": {}, + "id": "cf354a46-d693-47e5-8148-fbef8e8d8b25" + }, + { + "cell_type": "code", + "source": "vals = []\nlabs = []\ncols = []\nfor agent,col in (('rating_0','blue'),\n ('rating_1','red'),\n ('rating_2', 'green')\n ):\n vals.append(df[agent])\n labs.append(agent)\n cols.append(col)\nplt.hist(vals, color=cols, label=labs)\n\nplt.legend()\nplt.title('Overall ratings')\nplt.xticks(range(len(ratings)), ratings, rotation=60) \nNone", + "metadata": { + "trusted": true + }, + "execution_count": 16, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": "
", + "image/png": "\n" + }, + "metadata": {} + } + ], + "id": "cf6869d0-3498-4d44-9bf6-d55e580916d1" + }, + { + "cell_type": "code", + "source": "def nb_distinct_ratings(r):\n return len({r['rating_0'], r['rating_1'], r['rating_2']})\ndf['nb_distinct_ratings'] = df.apply(nb_distinct_ratings, axis=1)\ndf['nb_distinct_ratings'].hist()\nplt.xticks([1, 2, 3], ['3 Agree', '2 Agree', 'All disagree'],\n rotation=45) \nplt.ylabel('nb cases')\nplt.title('Agents agreements')", + "metadata": { + "trusted": true + }, + "execution_count": 17, + "outputs": [ + { + "execution_count": 17, + "output_type": "execute_result", + "data": { + "text/plain": "Text(0.5, 1.0, 'Agents agreements')" + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": "
", + "image/png": "\n" + }, + "metadata": {} + } + ], + "id": "fbbb3ed9-dcab-4c20-8a08-fa689b62db5d" + }, + { + "cell_type": "markdown", + "source": "Some suspicious cases we will further investigate later on:", + "metadata": {}, + "id": "7fb17be1-df26-44fc-82d3-88c5255ca5ee" + }, + { + "cell_type": "code", + "source": "print('All disagree:')\ndf[df['nb_distinct_ratings']==3]", + "metadata": { + "trusted": true + }, + "execution_count": 18, + "outputs": [ + { + "name": "stdout", + "text": "All disagree:\n", + "output_type": "stream" + }, + { + "execution_count": 18, + "output_type": "execute_result", + "data": { + "text/plain": "Empty DataFrame\nColumns: [query, docid, rating_0, rating_1, rating_2, rating_3, rating_4, rating_5, rating_6, rating_7, rating_8, rating_9, rating_10, rating_11, rating_12, rater_0, rater_1, rater_2, rater_3, rater_4, rater_5, rater_6, rater_7, rater_8, rater_9, rater_10, rater_11, rater_12, nb_distinct_ratings]\nIndex: []\n\n[0 rows x 29 columns]", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
querydocidrating_0rating_1rating_2rating_3rating_4rating_5rating_6rating_7...rater_4rater_5rater_6rater_7rater_8rater_9rater_10rater_11rater_12nb_distinct_ratings
\n

0 rows × 29 columns

\n
" + }, + "metadata": {} + } + ], + "id": "7d9b3876-17b5-42a0-97de-d9007b777aad" + }, + { + "cell_type": "markdown", + "source": "Some cases where all raters agree, we can have good confidence in the rating:", + "metadata": {}, + "id": "b4a60ac4-4c2b-4477-b7c9-285fb29f843c" + }, + { + "cell_type": "code", + "source": "print('All agree:')\ndf[df['nb_distinct_ratings']==1].sample(5)", + "metadata": { + "trusted": true + }, + "execution_count": 19, + "outputs": [ + { + "name": "stdout", + "text": "All agree:\n", + "output_type": "stream" + }, + { + "ename": "", + "evalue": "a must be greater than 0 unless no samples are taken", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[19], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mAll agree:\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m----> 2\u001b[0m \u001b[43mdf\u001b[49m\u001b[43m[\u001b[49m\u001b[43mdf\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mnb_distinct_ratings\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m==\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msample\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m5\u001b[39;49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/lib/python3.11/site-packages/pandas/core/generic.py:5773\u001b[0m, in \u001b[0;36mNDFrame.sample\u001b[0;34m(self, n, frac, replace, weights, random_state, axis, ignore_index)\u001b[0m\n\u001b[1;32m 5770\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m weights \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 5771\u001b[0m weights \u001b[38;5;241m=\u001b[39m 
sample\u001b[38;5;241m.\u001b[39mpreprocess_weights(\u001b[38;5;28mself\u001b[39m, weights, axis)\n\u001b[0;32m-> 5773\u001b[0m sampled_indices \u001b[38;5;241m=\u001b[39m \u001b[43msample\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msample\u001b[49m\u001b[43m(\u001b[49m\u001b[43mobj_len\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msize\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreplace\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mweights\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 5774\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtake(sampled_indices, axis\u001b[38;5;241m=\u001b[39maxis)\n\u001b[1;32m 5776\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ignore_index:\n", + "File \u001b[0;32m/lib/python3.11/site-packages/pandas/core/sample.py:150\u001b[0m, in \u001b[0;36msample\u001b[0;34m(obj_len, size, replace, weights, random_state)\u001b[0m\n\u001b[1;32m 147\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 148\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mInvalid weights: weights sum to zero\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 150\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mrandom_state\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mchoice\u001b[49m\u001b[43m(\u001b[49m\u001b[43mobj_len\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msize\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msize\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreplace\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreplace\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mp\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mweights\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39mastype(\n\u001b[1;32m 151\u001b[0m 
np\u001b[38;5;241m.\u001b[39mintp, copy\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m 152\u001b[0m )\n", + "File \u001b[0;32mmtrand.pyx:928\u001b[0m, in \u001b[0;36mnumpy.random.mtrand.RandomState.choice\u001b[0;34m()\u001b[0m\n", + "\u001b[0;31mValueError\u001b[0m: a must be greater than 0 unless no samples are taken" + ], + "output_type": "error" + } + ], + "id": "8b065a8e-c76c-4e8b-ab5e-e1ec25db6a60" + }, + { + "cell_type": "markdown", + "source": "Not perfect but 2 / 3 raters agreed on the rating value:", + "metadata": {}, + "id": "294c4445-1c81-4c73-96ac-6001f3f83ff2" + }, + { + "cell_type": "code", + "source": "print('Majority agree:')\ndf[df['nb_distinct_ratings']==2]", + "metadata": { + "trusted": true + }, + "execution_count": 20, + "outputs": [ + { + "name": "stdout", + "text": "Majority agree:\n", + "output_type": "stream" + }, + { + "execution_count": 20, + "output_type": "execute_result", + "data": { + "text/plain": "Empty DataFrame\nColumns: [query, docid, rating_0, rating_1, rating_2, rating_3, rating_4, rating_5, rating_6, rating_7, rating_8, rating_9, rating_10, rating_11, rating_12, rater_0, rater_1, rater_2, rater_3, rater_4, rater_5, rater_6, rater_7, rater_8, rater_9, rater_10, rater_11, rater_12, nb_distinct_ratings]\nIndex: []\n\n[0 rows x 29 columns]", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
querydocidrating_0rating_1rating_2rating_3rating_4rating_5rating_6rating_7...rater_4rater_5rater_6rater_7rater_8rater_9rater_10rater_11rater_12nb_distinct_ratings
\n

0 rows × 29 columns

\n
" + }, + "metadata": {} + } + ], + "id": "2b300fa0-4876-46c6-9526-91afe73b1d7a" + }, + { + "cell_type": "markdown", + "source": "### Big discrepancies\n\nThose cases are the most suspicious ones. Some raters rated the pair as `3-Perfect` while others rated it as `1-Fair` or `0-Bad` (or there is at least a difference of 2 grades).\nThere can be several reasons for that:\n - there is an issue with the pair: query not clear, document not clear\n - guidelines not well specified: a very common case is when the guidelines are not 100% clear to the raters. For example, what happens if the image of the results is relevant but the text is not? Or if some document fields are missing?\n - no clear intent. Was the intent of the query clear enough? This can cause confusion to the raters. It's important to give an opportunity to the rater to say `I don't know how to rate this pair!`\n", + "metadata": {}, + "id": "a2b7880d-4707-4cf0-ad2e-20f5ee1f912e" + }, + { + "cell_type": "code", + "source": "def big_discrepancy(r):\n \"\"\"returns 1 if there is at least one 2 grades between 1 rating and another, 0 otherwise\"\"\"\n ratings = [r[f'rating_{i}'] for i in range(nb_raters)]\n return 1 if max(ratings) - min(ratings) >=2 else 0\ndf['big_discrepancy'] = df.apply(big_discrepancy, axis=1)\ndf[df['big_discrepancy']==1].sample(2)", + "metadata": { + "trusted": true + }, + "execution_count": 21, + "outputs": [ + { + "ename": "", + "evalue": "a must be greater than 0 unless no samples are taken", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[21], line 6\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;241m1\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mmax\u001b[39m(ratings) \u001b[38;5;241m-\u001b[39m \u001b[38;5;28mmin\u001b[39m(ratings) 
\u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m2\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;241m0\u001b[39m\n\u001b[1;32m 5\u001b[0m df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbig_discrepancy\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m df\u001b[38;5;241m.\u001b[39mapply(big_discrepancy, axis\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m)\n\u001b[0;32m----> 6\u001b[0m \u001b[43mdf\u001b[49m\u001b[43m[\u001b[49m\u001b[43mdf\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mbig_discrepancy\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m==\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msample\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m2\u001b[39;49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/lib/python3.11/site-packages/pandas/core/generic.py:5773\u001b[0m, in \u001b[0;36mNDFrame.sample\u001b[0;34m(self, n, frac, replace, weights, random_state, axis, ignore_index)\u001b[0m\n\u001b[1;32m 5770\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m weights \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 5771\u001b[0m weights \u001b[38;5;241m=\u001b[39m sample\u001b[38;5;241m.\u001b[39mpreprocess_weights(\u001b[38;5;28mself\u001b[39m, weights, axis)\n\u001b[0;32m-> 5773\u001b[0m sampled_indices \u001b[38;5;241m=\u001b[39m \u001b[43msample\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msample\u001b[49m\u001b[43m(\u001b[49m\u001b[43mobj_len\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msize\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreplace\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mweights\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 5774\u001b[0m result \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtake(sampled_indices, axis\u001b[38;5;241m=\u001b[39maxis)\n\u001b[1;32m 5776\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ignore_index:\n", + "File \u001b[0;32m/lib/python3.11/site-packages/pandas/core/sample.py:150\u001b[0m, in \u001b[0;36msample\u001b[0;34m(obj_len, size, replace, weights, random_state)\u001b[0m\n\u001b[1;32m 147\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 148\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mInvalid weights: weights sum to zero\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 150\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mrandom_state\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mchoice\u001b[49m\u001b[43m(\u001b[49m\u001b[43mobj_len\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msize\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msize\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreplace\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreplace\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mp\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mweights\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39mastype(\n\u001b[1;32m 151\u001b[0m np\u001b[38;5;241m.\u001b[39mintp, copy\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m 152\u001b[0m )\n", + "File \u001b[0;32mmtrand.pyx:928\u001b[0m, in \u001b[0;36mnumpy.random.mtrand.RandomState.choice\u001b[0;34m()\u001b[0m\n", + "\u001b[0;31mValueError\u001b[0m: a must be greater than 0 unless no samples are taken" + ], + "output_type": "error" + } + ], + "id": "a784ae64-cb70-43dd-8bca-459a60663987" + }, + { + "cell_type": "markdown", + "source": "### Overall confusion between raters\nConfusion analysis allows to understand the types of disagreements between raters. 
\nIn a healthy rating setup we would expect to have most of the confusions between adjacent grades, i.e. `0` and `1` or `1` and `2`.\n", + "metadata": {}, + "id": "9df4a62c-df0d-46fd-bfa0-c96c9158542c" + }, + { + "cell_type": "code", + "source": "y1 = []\ny2 = []\nfor i,r in df.iterrows():\n #12\n y1.append(r['rating_0'])\n y2.append(r['rating_1'])\n\n ", + "metadata": { + "trusted": true + }, + "execution_count": 22, + "outputs": [], + "id": "6d8a99b9-f823-4829-aed8-9e376a0dfa73" + }, + { + "cell_type": "code", + "source": "from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\ncm = confusion_matrix(y1, y2, labels=range(len(ratings)))\ndisp = ConfusionMatrixDisplay(confusion_matrix=cm,\n display_labels=ratings)\nplt.figure(figsize=(8,8))\ndisp.plot(xticks_rotation=45, colorbar=False, cmap=plt.cm.Blues, ax=plt.gca(), values_format='d')\nplt.xlabel('');plt.ylabel('')\nplt.gca().xaxis.tick_top()", + "metadata": { + "trusted": true + }, + "execution_count": 23, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": "
", + "image/png": "\n" + }, + "metadata": {} + } + ], + "id": "1a4f7484-185c-42a3-8931-32a28a6d6964" + }, + { + "cell_type": "markdown", + "source": "This work has been provided by Wallapop Search team (http://www.wallapop.com/).\n\n_This notebook was last updated 17-FEB-2024_", + "metadata": {}, + "id": "7c08fac4-ea2e-4aae-92e8-cde6739dc131" + }, + { + "cell_type": "code", + "source": "", + "metadata": {}, + "execution_count": null, + "outputs": [], + "id": "349421e4-dab9-42ef-afba-c2d9df1a9929" + } + ] +} \ No newline at end of file