Add methods and constants for genai metrics #2524

Merged 15 commits on Feb 1, 2024
14 changes: 14 additions & 0 deletions raiwidgets/raiwidgets/responsibleai_dashboard.py
@@ -122,6 +122,15 @@ def get_question_answering_metrics():
methods=["POST"]
)

def get_generative_text_metrics():
data = request.get_json(force=True)
return jsonify(self.input.get_generative_text_metrics(data))
self.add_url_rule(
get_generative_text_metrics,
'/get_generative_text_metrics',
methods=["POST"]
)

if hasattr(self._service, 'socketio'):
@self._service.socketio.on('handle_object_detection_json')
def handle_object_detection_json(od_json):
@@ -132,3 +141,8 @@ def handle_object_detection_json(od_json):
def handle_question_answering_json(qa_json):
qa_data = json.loads(qa_json['data'])
return self.input.get_question_answering_metrics(qa_data)

@self._service.socketio.on('handle_generative_text_json')
def handle_generative_text_json(gt_json):
gt_data = json.loads(gt_json['data'])
return self.input.get_generative_text_metrics(gt_data)
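
A minimal sketch of how a client could exercise the new HTTP route; the host, port and index values are assumptions, and the payload order mirrors get_generative_text_metrics in the dashboard input class below:

    import requests  # assumed to be available in the client environment

    payload = [
        [[0, 1, 2]],  # selection_indexes: one cohort of row indices (hypothetical)
        {},           # generative_text_cache (may be empty)
    ]
    resp = requests.post(
        'http://localhost:5000/get_generative_text_metrics',  # hypothetical service URL
        json=payload,
    )
    metrics = resp.json()
    # The socketio path is equivalent: emit 'handle_generative_text_json' with
    # {'data': json.dumps(payload)} and read the acknowledgement returned above.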
33 changes: 32 additions & 1 deletion raiwidgets/raiwidgets/responsibleai_dashboard_input.py
@@ -171,7 +171,7 @@ def _prepare_filtered_error_analysis_data(self, features, filters,

def debug_ml(self, data):
try:
features = data[0]
features = data[0] # TODO: Remove prompt feature
filters = data[1]
composite_filters = data[2]
max_depth = data[3]
@@ -484,3 +484,34 @@ def get_question_answering_metrics(self, post_data):
"inner error: {}".format(e_str),
WidgetRequestResponseConstants.data: []
}

def get_generative_text_metrics(self, post_data):
"""Flask endpoint function to get Model Overview metrics
for the Generative Text scenario.

:param post_data: List of inputs in the order
[selection_indexes, generative_text_cache].
:type post_data: List

:return: JSON/dict data response
:rtype: Dict[str, List]
"""
try:
selection_indexes = post_data[0]
generative_text_cache = post_data[1]
exp = self._analysis.compute_genai_metrics(
selection_indexes,
generative_text_cache
)
return {
WidgetRequestResponseConstants.data: exp
}
except Exception as e:
print(e)
traceback.print_exc()
e_str = _format_exception(e)
return {
WidgetRequestResponseConstants.error:
EXP_VIZ_ERR_MSG.format(e_str),
WidgetRequestResponseConstants.data: []
}
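
For reference, a minimal sketch of the payload this method expects and the shape of its response; the example values are assumptions, not part of this PR:

    # post_data[0]: one list of dashboard row indices per cohort
    # post_data[1]: metric values previously cached by the dashboard
    post_data = [
        [[0, 1, 2], [3, 4]],  # selection_indexes for two cohorts (hypothetical)
        {},                   # generative_text_cache
    ]
    # On success the method returns
    #     {WidgetRequestResponseConstants.data: <list of per-cohort metric dicts>}
    # and on failure it returns the formatted error plus an empty data list,
    # as shown in the except branch above.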
8 changes: 8 additions & 0 deletions responsibleai_text/responsibleai_text/common/constants.py
@@ -18,6 +18,8 @@ class ModelTask(str, Enum):
QUESTION_ANSWERING = 'question_answering'
ENTAILMENT = 'entailment'
SUMMARIZATIONS = 'summarizations'
GENERATIVE_TEXT = 'generative_text'
GENERATIVE_TEXT_CHAT = 'generative_text_chat'
UNKNOWN = 'unknown'


@@ -34,3 +36,9 @@ class QuestionAnsweringFields(object):
QUESTION = "question"
CONTEXT = "context"
ANSWERS = "answers"


class GenerativeTextFields(object):
PROMPT = "prompt"
SYS_PROMPT = "sys_prompt"
RESPONSE = "response"
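
As a quick illustration (the column values, and using the response as a column of the test data, are assumptions), a generative-text dataset would use these field names together with the new task constant:

    import pandas as pd

    from responsibleai_text.common.constants import GenerativeTextFields, ModelTask

    test = pd.DataFrame({
        GenerativeTextFields.PROMPT: ["Summarize the following article ..."],
        GenerativeTextFields.SYS_PROMPT: ["You are a helpful assistant."],
        GenerativeTextFields.RESPONSE: ["The article argues that ..."],
    })
    task_type = ModelTask.GENERATIVE_TEXT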
@@ -12,6 +12,7 @@
import pandas as pd
from ml_wrappers import wrap_model

from erroranalysis._internal.constants import ModelTask as ErrorAnalysisTask
from erroranalysis._internal.error_analyzer import ModelAnalyzer
from erroranalysis._internal.error_report import as_error_report
from responsibleai._tools.shared.state_directory_management import \
@@ -22,6 +23,7 @@
from responsibleai.managers.error_analysis_manager import as_error_config
from responsibleai_text.common.constants import ModelTask
from responsibleai_text.utils.feature_extractors import get_text_columns
from responsibleai_text.utils.genai_metrics.metrics import get_genai_metric

LABELS = 'labels'

@@ -83,6 +85,14 @@ def __init__(self, model, dataset, is_multilabel, task_type, classes=None):
self.predictions = self.model.predict(
self.dataset.loc[:, ['context', 'questions']])
self.predictions = np.array(self.predictions)
elif self.task_type == ModelTask.GENERATIVE_TEXT:
# TODO: Decide the final metric for error analysis
coherence = get_genai_metric(
'coherence',
predictions=self.model.predict(self.dataset),
references=dataset['prompt'],
wrapper_model=self.model)
self.predictions = np.array(coherence['scores'])
else:
raise ValueError("Unknown task type: {}".format(self.task_type))
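
To make the role of these pseudo-predictions concrete, here is a minimal sketch (values are hypothetical; only the 'scores' key comes from this PR) of how the coherence output feeds error analysis:

    import numpy as np

    # get_genai_metric returns a dict whose 'scores' entry holds one rating
    # per row, e.g. three prompts rated 5, 4 and 2:
    coherence = {'scores': [5, 4, 2]}
    predictions = np.array(coherence['scores'])
    # ErrorAnalysisManager (below) treats these scores as regression predictions
    # against a constant target_score of 5, so low-coherence rows surface as
    # high-error cohorts.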

@@ -193,9 +203,17 @@ def __init__(self, model: Any, dataset: pd.DataFrame,
task_type, index_classes)
if categorical_features is None:
categorical_features = []
if task_type == ModelTask.GENERATIVE_TEXT:
sup_task_type = ErrorAnalysisTask.REGRESSION
ext_dataset = ext_dataset.copy()
del ext_dataset['prompt']
ext_dataset['target_score'] = 5
target_column = 'target_score'
else:
sup_task_type = ErrorAnalysisTask.CLASSIFICATION
super(ErrorAnalysisManager, self).__init__(
index_predictor, ext_dataset, target_column,
classes, categorical_features)
classes, categorical_features, model_task=sup_task_type)

@staticmethod
def _create_index_predictor(model, dataset, target_column,
@@ -30,6 +30,8 @@
from responsibleai_text.managers.explainer_manager import ExplainerManager
from responsibleai_text.utils.feature_extractors import (extract_features,
get_text_columns)
from responsibleai_text.utils.genai_metrics.metrics import \
get_genai_metric_mean

module_logger = logging.getLogger(__name__)
module_logger.setLevel(logging.INFO)
@@ -116,7 +118,8 @@ def __init__(self, model: Any, test: pd.DataFrame,
serializer: Optional[Any] = None,
maximum_rows_for_test: int = 5000,
feature_metadata: Optional[FeatureMetadata] = None,
text_column: Optional[Union[str, List]] = None):
text_column: Optional[Union[str, List]] = None,
eval_model: Any = None):
"""Creates an RAITextInsights object.

:param model: The model to compute RAI insights for.
@@ -148,6 +151,10 @@ def __init__(self, model: Any, test: pd.DataFrame,
If not provided, and there is additional feature metadata, then
an exception will be raised.
:type text_column: str or list[str]
:param eval_model: The model to use for evaluation with AI-assisted
metrics. If not provided, then the model passed in the model
parameter will be used.
:type eval_model: object
"""
# drop index as this can cause issues later like when copying
# target column below from test dataset to _ext_test_df
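
A minimal sketch of how a caller could wire up the eval_model parameter documented above (the dataset, both model objects, and the top-level import path are assumptions; only the parameter names and the task constant come from this PR):

    from responsibleai_text import RAITextInsights  # assumed public import path
    from responsibleai_text.common.constants import ModelTask

    rai_insights = RAITextInsights(
        model=generation_model,     # hypothetical model under analysis
        test=test_df,               # hypothetical DataFrame with a 'prompt' column
        target_column=None,         # optional for the generative text task
        task_type=ModelTask.GENERATIVE_TEXT,
        eval_model=gpt_eval_model,  # hypothetical model used for AI-assisted metrics
    )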
@@ -160,6 +167,10 @@
self._text_column = text_column
self._feature_metadata = feature_metadata
self._wrapped_model = wrap_model(model, test, task_type)
if eval_model is None:
self._eval_model = self._wrapped_model
else:
self._eval_model = wrap_model(eval_model, test, task_type)
self._validate_rai_insights_input_parameters(
model=self._wrapped_model, test=test,
target_column=target_column, task_type=task_type,
@@ -269,7 +280,9 @@ def _validate_model(self, model: Any, test: pd.DataFrame,
target_column, axis=1)
small_test_data = get_text_columns(small_test_data, text_column)
small_test_data = small_test_data.iloc[0]
if task_type != ModelTask.QUESTION_ANSWERING:
if task_type not in [
ModelTask.QUESTION_ANSWERING,
ModelTask.GENERATIVE_TEXT]:
small_test_data = small_test_data.tolist()
# Call the model
try:
@@ -319,7 +332,8 @@ def _validate_rai_insights_input_parameters(
ModelTask.SENTIMENT_ANALYSIS.value,
ModelTask.QUESTION_ANSWERING.value,
ModelTask.ENTAILMENT.value,
ModelTask.SUMMARIZATIONS.value
ModelTask.SUMMARIZATIONS.value,
ModelTask.GENERATIVE_TEXT.value,
]

if task_type not in valid_tasks:
@@ -362,6 +376,10 @@ def _validate_rai_insights_input_parameters(
if not target_columns_set.issubset(set(test.columns)):
raise UserConfigValidationException(
'The list of target_column(s) should be in test data')
elif (task_type == ModelTask.GENERATIVE_TEXT.value and
target_column is None):
# target column is optional for generative text
pass
else:
if target_column not in list(test.columns):
raise UserConfigValidationException(
@@ -514,6 +532,11 @@ def _get_test_text_data(self, is_classification_task):
dataset = self.test.drop(target_column, axis=1)
elif self.task_type == ModelTask.QUESTION_ANSWERING:
dataset = self.test.drop([self.target_column], axis=1)
elif self.task_type == ModelTask.GENERATIVE_TEXT:
if self.target_column is None:
dataset = self.test.copy()
else:
dataset = self.test.drop([self.target_column], axis=1)
else:
raise ValueError("Unknown task type: {}".format(self.task_type))
dataset = get_text_columns(dataset, self._text_column)
@@ -853,3 +876,71 @@ def compute_question_answering_metrics(
except ValueError:
all_cohort_metrics.append([0, 0, 0, 0, 0, 0])
return all_cohort_metrics

def compute_genai_metrics(
self,
selection_indexes,
genai_cache
):
dashboard_dataset = self.get_data().dataset
prompt_idx = dashboard_dataset.feature_names.index('prompt')
prompts = [feat[prompt_idx] for feat in dashboard_dataset.features]
true_y = dashboard_dataset.true_y
predicted_y = dashboard_dataset.predicted_y

all_cohort_metrics = []
for cohort_indices in selection_indexes:
cohort_metrics = dict()

if true_y is None:
true_y_cohort = None
else:
true_y_cohort = [true_y[cohort_index] for cohort_index
in cohort_indices]
predicted_y_cohort = [predicted_y[cohort_index] for cohort_index
in cohort_indices]
prompts_cohort = [prompts[cohort_index] for cohort_index
in cohort_indices]
try:
if true_y_cohort is not None:
exact_match = evaluate.load('exact_match')
cohort_metrics['exact_match'] = exact_match.compute(
predictions=predicted_y_cohort,
references=true_y_cohort)

cohort_metrics['coherence'] = get_genai_metric_mean(
'coherence',
predictions=predicted_y_cohort,
references=prompts_cohort,
wrapper_model=self._eval_model)

if true_y_cohort is not None:
cohort_metrics['equivalence'] = get_genai_metric_mean(
'equivalence',
predictions=predicted_y_cohort,
references=prompts_cohort,
answers=true_y_cohort,
wrapper_model=self._eval_model)

cohort_metrics['fluency'] = get_genai_metric_mean(
'fluency',
predictions=predicted_y_cohort,
references=prompts_cohort,
wrapper_model=self._eval_model)

cohort_metrics['groundedness'] = get_genai_metric_mean(
'groundedness',
predictions=predicted_y_cohort,
references=prompts_cohort,
wrapper_model=self._eval_model)

cohort_metrics['relevance'] = get_genai_metric_mean(
'relevance',
predictions=predicted_y_cohort,
references=prompts_cohort,
wrapper_model=self._eval_model)

all_cohort_metrics.append(cohort_metrics)
except ValueError:
all_cohort_metrics.append({})
return all_cohort_metrics
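
For context, a minimal sketch of calling this method and the result it produces (index values are hypothetical; the metric keys are the ones computed above):

    selection_indexes = [[0, 1, 2], [3, 4]]  # one list of row indices per cohort
    genai_cache = {}                         # cache passed through from the widget
    per_cohort = rai_insights.compute_genai_metrics(selection_indexes, genai_cache)
    # per_cohort holds one dict per cohort with keys such as 'coherence',
    # 'fluency', 'groundedness' and 'relevance'; 'exact_match' and 'equivalence'
    # appear only when true_y is available, and a cohort that raises ValueError
    # contributes an empty dict.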
@@ -12,7 +12,8 @@
from tqdm import tqdm

from nlp_feature_extractors import attribute_extractors as exts
from responsibleai_text.common.constants import (ModelTask,
from responsibleai_text.common.constants import (GenerativeTextFields,
ModelTask,
QuestionAnsweringFields)

nlp = None
@@ -60,6 +61,9 @@ def extract_features(text_dataset: pd.DataFrame,
feature_names.append(prefix + "maximum_parse_tree_depth")
feature_names.append("question_type")
feature_names.append("context_overlap")
elif task_type == ModelTask.GENERATIVE_TEXT:
start_meta_index = 0
feature_names = base_feature_names
else:
raise ValueError("Unknown task type: {}".format(task_type))
# copy over the metadata column names
@@ -96,6 +100,19 @@ def extract_features(text_dataset: pd.DataFrame,
context_overlap = get_context_overlap(context=context,
question=question)
extracted_features.append(context_overlap)
# append all other metadata features
append_metadata_values(start_meta_index, text_dataset, i,
extracted_features, has_dropped_features,
dropped_features, column_names)
results.append(extracted_features)
elif task_type == ModelTask.GENERATIVE_TEXT:
for i, row in tqdm(text_features.iterrows(),
desc='feature extraction'):
extracted_features = []
add_extracted_features_for_sentence(
row[GenerativeTextFields.PROMPT], extracted_features,
task_type)

# append all other metadata features
append_metadata_values(start_meta_index, text_dataset, i,
extracted_features, has_dropped_features,
@@ -0,0 +1,4 @@
# Copyright (c) Microsoft Corporation
# Licensed under the MIT License.

"""Contains the GenAI metrics."""
@@ -0,0 +1,4 @@
# Copyright (c) Microsoft Corporation
# Licensed under the MIT License.

"""Contains the implementation of various metrics for GenAI."""
@@ -24,7 +24,7 @@ def _compute_metric(template, logger, wrapper_model, **kwargs):
templated_ques = format_str(template, **kwargs)

inp = pd.DataFrame({
'questions': templated_ques,
'prompt': templated_ques,
'sys_prompt': _SYS_PROMPT})

responses = wrapper_model.predict(inp)
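To clarify the contract this rename relies on, here is a minimal sketch of an evaluation wrapper whose predict consumes the 'prompt'/'sys_prompt' frame built above and returns one rating string per row (the class and its scoring logic are purely illustrative assumptions):

    import pandas as pd

    class DummyEvalModel:
        """Hypothetical eval model that rates every templated prompt as 5."""

        def predict(self, inp: pd.DataFrame):
            # inp carries one row per example with 'prompt' and 'sys_prompt' columns
            return ['5' for _ in range(len(inp))]

    # _compute_metric would pass the templated questions to such a model and
    # collect the returned ratings into the 'scores' list used elsewhere.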
@@ -48,6 +48,15 @@
This rating value should always be an integer between 1 and 5. So the rating \
produced should be 1 or 2 or 3 or 4 or 5.
Some examples of valid responses are:
1
2
5
Some examples of invalid responses are:
1/5
1.5
3.0
5 stars
QUESTION:
{question}
@@ -47,6 +47,15 @@
This rating value should always be an integer between 1 and 5. So the rating \
produced should be 1 or 2 or 3 or 4 or 5.
Some examples of valid responses are:
1
2
5
Some examples of invalid responses are:
1/5
1.5
3.0
5 stars
QUESTION:
{question}
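As a small illustration of why the templates now insist on a bare integer (this parsing helper is an assumption for illustration, not the library's actual code):

    def parse_rating(response: str) -> int:
        """Accepts '1'..'5'; rejects values like '1/5', '1.5', '3.0' or '5 stars'."""
        rating = int(response.strip())
        if not 1 <= rating <= 5:
            raise ValueError('Rating out of range: {}'.format(rating))
        return rating

    parse_rating('4')      # -> 4
    # parse_rating('1/5')  # -> ValueError: invalid literal for int()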