From 66e33cb59eb5e10db4bd275f96e6267584e7f6a8 Mon Sep 17 00:00:00 2001
From: Ilya Matiach
Date: Fri, 26 Jan 2024 17:00:49 -0500
Subject: [PATCH] fix builds by supporting sparse_output renamed parameter in
 OneHotEncoder in new scikit-learn version update (#2507)

---
 .github/workflows/CI-python.yml                       |  6 ++++++
 ...ranalysis-interpretability-dashboard-census.ipynb  | 10 +++++++++-
 ...oard-housing-classification-model-debugging.ipynb  | 12 ++++++++++--
 ...ponsibleaidashboard-housing-decision-making.ipynb  | 12 ++++++++++--
 .../models/sklearn/sklearn_model_utils.py             | 11 +++++++++--
 5 files changed, 44 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/CI-python.yml b/.github/workflows/CI-python.yml
index fcaae683e0..b9d9485a9d 100644
--- a/.github/workflows/CI-python.yml
+++ b/.github/workflows/CI-python.yml
@@ -58,6 +58,12 @@ jobs:
           pip install -v -e .
         working-directory: ${{ matrix.packageDirectory }}
 
+      - if: ${{ (matrix.packageDirectory == 'erroranalysis') || (matrix.packageDirectory == 'responsibleai') }}
+        name: Install rai_test_utils locally until next version is released
+        run: |
+          pip install -v -e .
+        working-directory: rai_test_utils
+
       - name: Pip freeze
         run: |
           pip freeze > installed-requirements-dev.txt
diff --git a/notebooks/individual-dashboards/erroranalysis-dashboard/erroranalysis-interpretability-dashboard-census.ipynb b/notebooks/individual-dashboards/erroranalysis-dashboard/erroranalysis-interpretability-dashboard-census.ipynb
index d8ad57e2f0..d5ba5c57b4 100644
--- a/notebooks/individual-dashboards/erroranalysis-dashboard/erroranalysis-interpretability-dashboard-census.ipynb
+++ b/notebooks/individual-dashboards/erroranalysis-dashboard/erroranalysis-interpretability-dashboard-census.ipynb
@@ -118,11 +118,19 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "from packaging import version\n",
+    "import sklearn\n",
     "from sklearn.pipeline import Pipeline\n",
     "from sklearn.impute import SimpleImputer\n",
     "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
     "from sklearn.compose import ColumnTransformer\n",
     "\n",
+    "# for older scikit-learn versions use sparse, for newer sparse_output:\n",
+    "if version.parse(sklearn.__version__) < version.parse('1.2'):\n",
+    "    ohe_params = {\"sparse\": False}\n",
+    "else:\n",
+    "    ohe_params = {\"sparse_output\": False}\n",
+    "\n",
     "def split_label(dataset):\n",
     "    X = dataset.drop(['income'], axis=1)\n",
     "    y = dataset[['income']]\n",
@@ -141,7 +149,7 @@
     "    ])\n",
     "    cat_pipe = Pipeline([\n",
     "        ('cat_imputer', SimpleImputer(strategy='constant', fill_value='?')),\n",
-    "        ('cat_encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))\n",
+    "        ('cat_encoder', OneHotEncoder(handle_unknown='ignore', **ohe_params))\n",
     "    ])\n",
     "    feat_pipe = ColumnTransformer([\n",
     "        ('num_pipe', num_pipe, pipe_cfg['num_cols']),\n",
diff --git a/notebooks/responsibleaidashboard/tabular/responsibleaidashboard-housing-classification-model-debugging.ipynb b/notebooks/responsibleaidashboard/tabular/responsibleaidashboard-housing-classification-model-debugging.ipynb
index 490b33a119..ed1c234e2d 100644
--- a/notebooks/responsibleaidashboard/tabular/responsibleaidashboard-housing-classification-model-debugging.ipynb
+++ b/notebooks/responsibleaidashboard/tabular/responsibleaidashboard-housing-classification-model-debugging.ipynb
@@ -70,12 +70,20 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "from packaging import version\n",
     "from raiutils.dataset import fetch_dataset\n",
+    "import sklearn\n",
     "from sklearn.pipeline import Pipeline\n",
     "from sklearn.impute import SimpleImputer\n",
     "from sklearn.preprocessing import OneHotEncoder\n",
     "from sklearn.compose import ColumnTransformer\n",
     "\n",
+    "# for older scikit-learn versions use sparse, for newer sparse_output:\n",
+    "if version.parse(sklearn.__version__) < version.parse('1.2'):\n",
+    "    ohe_params = {\"sparse\": False}\n",
+    "else:\n",
+    "    ohe_params = {\"sparse_output\": False}\n",
+    "\n",
     "def split_label(dataset, target_feature):\n",
     "    X = dataset.drop([target_feature], axis=1)\n",
     "    y = dataset[[target_feature]]\n",
@@ -93,7 +101,7 @@
     "    ])\n",
     "    cat_pipe = Pipeline([\n",
     "        ('cat_imputer', SimpleImputer(strategy='constant', fill_value='?')),\n",
-    "        ('cat_encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))\n",
+    "        ('cat_encoder', OneHotEncoder(handle_unknown='ignore', **ohe_params))\n",
     "    ])\n",
     "    feat_pipe = ColumnTransformer([\n",
     "        ('num_pipe', num_pipe, pipe_cfg['num_cols']),\n",
@@ -179,7 +187,7 @@
    "source": [
     "To use Responsible AI Dashboard, initialize a RAIInsights object upon which different components can be loaded.\n",
     "\n",
-    "RAIInsights accepts the model, the full dataset, the test dataset, the target feature string and the task type string as its arguments.",
+    "RAIInsights accepts the model, the full dataset, the test dataset, the target feature string and the task type string as its arguments.\n",
     "\n",
     "You may also create the `FeatureMetadata` container, identify any feature of your choice as the `identity_feature`, specify a list of strings of categorical feature names via the `categorical_features` parameter, and specify dropped features via the `dropped_features` parameter. The `FeatureMetadata` may also be passed into the `RAIInsights`."
    ]
diff --git a/notebooks/responsibleaidashboard/tabular/responsibleaidashboard-housing-decision-making.ipynb b/notebooks/responsibleaidashboard/tabular/responsibleaidashboard-housing-decision-making.ipynb
index 333e9a2b29..7beafba689 100644
--- a/notebooks/responsibleaidashboard/tabular/responsibleaidashboard-housing-decision-making.ipynb
+++ b/notebooks/responsibleaidashboard/tabular/responsibleaidashboard-housing-decision-making.ipynb
@@ -59,12 +59,20 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "from packaging import version\n",
     "from raiutils.dataset import fetch_dataset\n",
+    "import sklearn\n",
     "from sklearn.pipeline import Pipeline\n",
     "from sklearn.impute import SimpleImputer\n",
     "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
     "from sklearn.compose import ColumnTransformer\n",
     "\n",
+    "# for older scikit-learn versions use sparse, for newer sparse_output:\n",
+    "if version.parse(sklearn.__version__) < version.parse('1.2'):\n",
+    "    ohe_params = {\"sparse\": False}\n",
+    "else:\n",
+    "    ohe_params = {\"sparse_output\": False}\n",
+    "\n",
     "def split_label(dataset, target_feature):\n",
     "    X = dataset.drop([target_feature], axis=1)\n",
     "    y = dataset[[target_feature]]\n",
@@ -83,7 +91,7 @@
     "    ])\n",
     "    cat_pipe = Pipeline([\n",
     "        ('cat_imputer', SimpleImputer(strategy='constant', fill_value='?')),\n",
-    "        ('cat_encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))\n",
+    "        ('cat_encoder', OneHotEncoder(handle_unknown='ignore', **ohe_params))\n",
     "    ])\n",
     "    feat_pipe = ColumnTransformer([\n",
     "        ('num_pipe', num_pipe, pipe_cfg['num_cols']),\n",
@@ -148,7 +156,7 @@
    "source": [
     "To use Responsible AI Dashboard, initialize a RAIInsights object upon which different components can be loaded.\n",
     "\n",
-    "RAIInsights accepts the model, the full dataset, the test dataset, the target feature string and the task type string as its arguments.",
+    "RAIInsights accepts the model, the full dataset, the test dataset, the target feature string and the task type string as its arguments.\n",
     "\n",
     "You may also create the `FeatureMetadata` container, identify any feature of your choice as the `identity_feature`, specify a list of strings of categorical feature names via the `categorical_features` parameter, and specify dropped features via the `dropped_features` parameter. The `FeatureMetadata` may also be passed into the `RAIInsights`."
    ]
diff --git a/rai_test_utils/rai_test_utils/models/sklearn/sklearn_model_utils.py b/rai_test_utils/rai_test_utils/models/sklearn/sklearn_model_utils.py
index 9866fad009..4721d4ce01 100644
--- a/rai_test_utils/rai_test_utils/models/sklearn/sklearn_model_utils.py
+++ b/rai_test_utils/rai_test_utils/models/sklearn/sklearn_model_utils.py
@@ -3,6 +3,8 @@
 
 import numpy as np
 import pandas as pd
+import sklearn
+from packaging import version
 from sklearn import svm
 from sklearn.compose import ColumnTransformer
 from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
@@ -127,6 +129,11 @@ def conv(X):
         (conv(np.prod(x, axis=1)).reshape(-1, 1),
          conv(np.prod(x, axis=1)**2).reshape(-1, 1))
     ))
+    # for older scikit-learn versions use sparse, for newer sparse_output:
+    if version.parse(sklearn.__version__) < version.parse('1.2'):
+        ohe_params = {"sparse": False}
+    else:
+        ohe_params = {"sparse_output": False}
     transformations = ColumnTransformer([
         ("age_fare_1", Pipeline(steps=[
             ('imputer', SimpleImputer(strategy='median')),
@@ -137,8 +144,8 @@ def conv(X):
         ("embarked", Pipeline(steps=[
             ("imputer", SimpleImputer(strategy='constant',
                                       fill_value='missing')),
-            ("encoder", OneHotEncoder(sparse=False))]), ["embarked"]),
-        ("sex_pclass", OneHotEncoder(sparse=False), ["sex", "pclass"])
+            ("encoder", OneHotEncoder(**ohe_params))]), ["embarked"]),
+        ("sex_pclass", OneHotEncoder(**ohe_params), ["sex", "pclass"])
     ])
     clf = Pipeline(steps=[('preprocessor', transformations),
                           ('classifier',