Adding evaluations for the LLM AI implementation of ChatState #357

Merged
Changes from all commits (26 commits):
c954f2f - Adding chat state evals (mahomedalid, May 7, 2024)
201f9f3 - Adding tests (mahomedalid, May 7, 2024)
96b2b6b - Adding LLM evaluations (mahomedalid, May 7, 2024)
b09b323 - Renaming the workflow (mahomedalid, May 7, 2024)
4d9d126 - Renaming the workflow (mahomedalid, May 7, 2024)
85d1289 - Adding option for ollama (mahomedalid, May 7, 2024)
db2a31d - Adding nullcheck (mahomedalid, May 7, 2024)
622a5e5 - Adding nullcheck (mahomedalid, May 7, 2024)
f4c7412 - Adding nullcheck (mahomedalid, May 7, 2024)
857efbd - Adding nullcheck (mahomedalid, May 7, 2024)
e19443f - Fixing tests, omitting the adversarial one (mahomedalid, May 7, 2024)
284ec43 - Adding batch evaluations (mahomedalid, May 10, 2024)
383d030 - Merge branch 'may-2024-ai-updates' into features/chat-state-evaluations (mahomedalid, May 11, 2024)
fec3f02 - Adding processors (mahomedalid, May 14, 2024)
29b68c4 - Moving to AI instead of OPENAI (mahomedalid, May 14, 2024)
119a899 - Merge branch 'features/chat-state-evaluations' of https://github.com/… (mahomedalid, May 14, 2024)
69da4f9 - Moving to AI instead of OPENAI (mahomedalid, May 14, 2024)
3c01bc6 - Adding batch evals (mahomedalid, May 14, 2024)
a75395f - Adding batch evals (mahomedalid, May 14, 2024)
bd9b1f1 - Adding batch evals (mahomedalid, May 14, 2024)
f9c1b2d - Adding issue with relevance (mahomedalid, May 15, 2024)
cf94ac2 - Updating batch evals (mahomedalid, May 15, 2024)
e5d32bd - Adding batch eval samples (mahomedalid, May 15, 2024)
554c2fa - Adding batch eval samples (mahomedalid, May 15, 2024)
b0fc1eb - Enabling Azure Metrics (mahomedalid, May 15, 2024)
3d78753 - Adding connectrion strings messages (mahomedalid, May 15, 2024)
50 changes: 50 additions & 0 deletions .github/workflows/pr-validation-ai.yml
@@ -0,0 +1,50 @@
name: eShop Pull Request Validation - AI LLM Evals

on:
workflow_dispatch:
pull_request:
paths-ignore:
- '**.md'
- 'src/ClientApp/**'
- 'test/ClientApp.UnitTests/**'
- '.github/workflows/pr-validation-maui.yml'
push:
branches:
- main
paths-ignore:
- '**.md'
- 'src/ClientApp/**'
- 'test/ClientApp.UnitTests/**'
- '.github/workflows/pr-validation-maui.yml'

env:
ESHOP_AI_MODEL: "phi3"
ESHOP_TESTS_AI_COMPLETION_TYPE: "azureopenai"

jobs:
eval:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Setup .NET (global.json)
uses: actions/setup-dotnet@v3
- name: Install ollama
run: curl -fsSL https://ollama.com/install.sh | sh
- name: Update Workloads
run: dotnet workload update
- name: Install Workloads
run: dotnet workload install aspire
- name: Build
run: dotnet build eShop.LLMEvals.slnf
- name: Pull model
run: ollama pull ${{ env.ESHOP_AI_MODEL }}
- name: Test
env:
ESHOP_TESTS_AI_COMPLETION_TYPE: ${{ env.ESHOP_TESTS_AI_COMPLETION_TYPE }}
AZURE_AI_MODEL: ${{ secrets.AZURE_AI_MODEL }}
AZURE_AI_ENDPOINT: ${{ secrets.AZURE_AI_ENDPOINT }}
AZURE_AI_KEY: ${{ secrets.AZURE_AI_KEY }}
ESHOP_AI_ENDPOINT: "http://localhost:11434"
ESHOP_AI_MODEL: ${{ env.ESHOP_AI_MODEL }}
ESHOP_AI_KEY: "api"
run: dotnet test eShop.LLMEvals.slnf
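
The workflow drives both backends through environment variables: ESHOP_TESTS_AI_COMPLETION_TYPE selects Azure OpenAI, while the Ollama endpoint and the model pulled in the "Pull model" step cover local evaluation. A minimal sketch of how a test fixture might consume these variables follows; the EvalKernelFactory class, the backend switch, and the connector overloads are illustrative assumptions rather than code from this PR, and the exact Semantic Kernel registration methods vary by release.

// Hypothetical helper (not part of this PR): build a Semantic Kernel instance from the
// same environment variables the workflow sets. The connector overloads shown are
// assumptions and may differ across Semantic Kernel versions.
#pragma warning disable SKEXP0010 // custom OpenAI endpoints are experimental in SK 1.x
using System;
using Microsoft.SemanticKernel;

public static class EvalKernelFactory
{
    public static Kernel Create()
    {
        var completionType = Environment.GetEnvironmentVariable("ESHOP_TESTS_AI_COMPLETION_TYPE") ?? "ollama";
        var builder = Kernel.CreateBuilder();

        if (completionType == "azureopenai")
        {
            // Azure OpenAI: deployment, endpoint, and key come from repository secrets.
            builder.AddAzureOpenAIChatCompletion(
                deploymentName: Environment.GetEnvironmentVariable("AZURE_AI_MODEL")!,
                endpoint: Environment.GetEnvironmentVariable("AZURE_AI_ENDPOINT")!,
                apiKey: Environment.GetEnvironmentVariable("AZURE_AI_KEY")!);
        }
        else
        {
            // Ollama exposes an OpenAI-compatible endpoint on localhost:11434; the key value
            // ("api") is a placeholder because Ollama does not validate it.
            builder.AddOpenAIChatCompletion(
                modelId: Environment.GetEnvironmentVariable("ESHOP_AI_MODEL") ?? "phi3",
                endpoint: new Uri(Environment.GetEnvironmentVariable("ESHOP_AI_ENDPOINT") ?? "http://localhost:11434"),
                apiKey: Environment.GetEnvironmentVariable("ESHOP_AI_KEY") ?? "api");
        }

        return builder.Build();
    }
}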
10 changes: 8 additions & 2 deletions Directory.Packages.props
@@ -22,6 +22,8 @@
<PackageVersion Include="Aspire.RabbitMQ.Client" Version="$(AspireVersion)" />
<PackageVersion Include="Aspire.StackExchange.Redis" Version="$(AspireVersion)" />
<PackageVersion Include="Aspire.Azure.AI.OpenAI" Version="$(AspireVersion)" />
<PackageVersion Include="Azure.Monitor.OpenTelemetry.Exporter" Version="1.2.0" />
<PackageVersion Include="Microsoft.ApplicationInsights.WorkerService" Version="2.22.0" />
<PackageVersion Include="Microsoft.Extensions.ServiceDiscovery" Version="$(AspireVersion)" />
<PackageVersion Include="Microsoft.Extensions.ServiceDiscovery.Yarp" Version="$(AspireVersion)" />
<!-- Version together with ASP.NET -->
@@ -50,10 +52,13 @@
<PackageVersion Include="Microsoft.Extensions.Options.ConfigurationExtensions" Version="8.0.0" />
<PackageVersion Include="Microsoft.Extensions.Options" Version="8.0.2" />
<PackageVersion Include="Microsoft.Extensions.DependencyInjection.Abstractions" Version="8.0.1" />
<PackageVersion Include="Microsoft.Extensions.DependencyInjection" Version="8.0.0" />
<PackageVersion Include="Microsoft.Extensions.Logging.Console" Version="8.0.0" />
<!-- Xabaril packages -->
<PackageVersion Include="AspNetCore.HealthChecks.Uris" Version="8.0.1" />
<!-- AI -->
<PackageVersion Include="Microsoft.SemanticKernel" Version="1.10.0" />
<PackageVersion Include="Microsoft.SemanticKernel" Version="1.11.1" />
<PackageVersion Include="Microsoft.SemanticKernel.Core" Version="1.11.1" />
<PackageVersion Include="Microsoft.SemanticKernel.Connectors.Onnx" Version="1.10.0-alpha" />
<!-- Open Telemetry -->
<PackageVersion Include="OpenTelemetry.Exporter.OpenTelemetryProtocol" Version="1.8.1" />
@@ -84,9 +89,10 @@
<PackageVersion Include="Microsoft.Web.LibraryManager.Build" Version="2.1.175" />
<PackageVersion Include="Polly.Core" Version="8.3.1" />
<PackageVersion Include="Swashbuckle.AspNetCore" Version="6.5.0" />
<PackageVersion Include="System.CommandLine" Version="2.0.0-beta4.22272.1" />
<PackageVersion Include="System.Reflection.TypeExtensions" Version="4.7.0" />
<PackageVersion Include="xunit" Version="2.8.0" />
<PackageVersion Include="xunit.runner.visualstudio" Version="2.8.0" />
<PackageVersion Include="Yarp.ReverseProxy" Version="2.1.0" />
</ItemGroup>
</Project>
12 changes: 12 additions & 0 deletions eShop.LLMEvals.slnf
@@ -0,0 +1,12 @@

{
"solution": {
"path": "eShop.sln",
"projects": [
"src\\WebApp\\WebApp.csproj",
"src\\eShop.ServiceDefaults\\eShop.ServiceDefaults.csproj",
"src\\eShop.AppHost\\eShop.AppHost.csproj",
"tests\\AI.UnitTests\\AI.UnitTests.csproj"
]
}
}
21 changes: 21 additions & 0 deletions eShop.sln
@@ -65,6 +65,12 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ClientApp", "src\ClientApp\
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ClientApp.UnitTests", "tests\ClientApp.UnitTests\ClientApp.UnitTests.csproj", "{02878FFB-F4DA-4996-B4A6-308851A837C6}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "AI.UnitTests", "tests\AI.UnitTests\AI.UnitTests.csproj", "{94B500BE-D1EE-4DFE-BC0D-93D0AABC8203}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.SKEval", "src\Microsoft.SKEval\Microsoft.SKEval.csproj", "{E926E64C-2149-427D-99B4-743E16EBC2A1}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "AI.BatchEvals", "tests\AI.BatchEvals\AI.BatchEvals.csproj", "{B70C3934-8F9A-4F5A-9705-C255F57911D3}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
@@ -175,6 +181,18 @@ Global
{02878FFB-F4DA-4996-B4A6-308851A837C6}.Debug|Any CPU.Build.0 = Debug|Any CPU
{02878FFB-F4DA-4996-B4A6-308851A837C6}.Release|Any CPU.ActiveCfg = Release|Any CPU
{02878FFB-F4DA-4996-B4A6-308851A837C6}.Release|Any CPU.Build.0 = Release|Any CPU
{94B500BE-D1EE-4DFE-BC0D-93D0AABC8203}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{94B500BE-D1EE-4DFE-BC0D-93D0AABC8203}.Debug|Any CPU.Build.0 = Debug|Any CPU
{94B500BE-D1EE-4DFE-BC0D-93D0AABC8203}.Release|Any CPU.ActiveCfg = Release|Any CPU
{94B500BE-D1EE-4DFE-BC0D-93D0AABC8203}.Release|Any CPU.Build.0 = Release|Any CPU
{E926E64C-2149-427D-99B4-743E16EBC2A1}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{E926E64C-2149-427D-99B4-743E16EBC2A1}.Debug|Any CPU.Build.0 = Debug|Any CPU
{E926E64C-2149-427D-99B4-743E16EBC2A1}.Release|Any CPU.ActiveCfg = Release|Any CPU
{E926E64C-2149-427D-99B4-743E16EBC2A1}.Release|Any CPU.Build.0 = Release|Any CPU
{B70C3934-8F9A-4F5A-9705-C255F57911D3}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{B70C3934-8F9A-4F5A-9705-C255F57911D3}.Debug|Any CPU.Build.0 = Debug|Any CPU
{B70C3934-8F9A-4F5A-9705-C255F57911D3}.Release|Any CPU.ActiveCfg = Release|Any CPU
{B70C3934-8F9A-4F5A-9705-C255F57911D3}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
@@ -205,6 +223,9 @@ Global
{66275483-5364-42F9-B7E6-410E6A1B5ECF} = {932D8224-11F6-4D07-B109-DA28AD288A63}
{938803BB-4F6F-4108-BDD1-2AD0180BBDC1} = {932D8224-11F6-4D07-B109-DA28AD288A63}
{02878FFB-F4DA-4996-B4A6-308851A837C6} = {A857AD10-40FF-4303-BEC2-FF1C58D5735E}
{94B500BE-D1EE-4DFE-BC0D-93D0AABC8203} = {A857AD10-40FF-4303-BEC2-FF1C58D5735E}
{E926E64C-2149-427D-99B4-743E16EBC2A1} = {A857AD10-40FF-4303-BEC2-FF1C58D5735E}
{B70C3934-8F9A-4F5A-9705-C255F57911D3} = {A857AD10-40FF-4303-BEC2-FF1C58D5735E}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {25728519-5F0F-4973-8A64-0A81EB4EA8D9}
28 changes: 28 additions & 0 deletions src/Microsoft.SKEval/AddLLMEvalExtensions.cs
@@ -0,0 +1,28 @@
using Microsoft.SKEval;
using Microsoft.Extensions.Configuration;
using Microsoft.SemanticKernel;
using OpenTelemetry;
using OpenTelemetry.Metrics;
using OpenTelemetry.Trace;
using System.Diagnostics.Metrics;
using System.Text;

namespace Microsoft.SKEval.Metrics;

public static class AddLLMEvalExtensions
{

public static MeterProviderBuilder AddLLMEvalMetrics(
this MeterProviderBuilder builder,
IList<IEvaluator<int>> intEvaluators)
{
foreach (var evaluator in intEvaluators)
{
            builder.AddView(
                // The instrument name must match the histograms BatchEval creates ("llmeval.<id>.score"),
                // otherwise the explicit buckets are never applied.
                instrumentName: $"llmeval.{evaluator.Id.ToLowerInvariant()}.score",
                new ExplicitBucketHistogramConfiguration { Boundaries = [1, 2, 3, 4, 5] });
}

return builder;
}
}
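
The extension above registers an explicit-bucket view (boundaries 1 through 5) for each evaluator's score histogram. A short, hedged sketch of wiring it into an OpenTelemetry MeterProvider follows; BatchEval<T>.CreateMeterProviderBuilder in this PR does essentially the same wiring, and the console exporter plus the empty evaluator list are assumptions used only for illustration.

// Sketch only: hook the per-evaluator score buckets into an OpenTelemetry pipeline.
// The console exporter is an assumed stand-in for whichever exporter is configured.
using System.Collections.Generic;
using Microsoft.SKEval;
using Microsoft.SKEval.Metrics;
using OpenTelemetry;
using OpenTelemetry.Metrics;

IList<IEvaluator<int>> scoreEvaluators = new List<IEvaluator<int>>(); // e.g. a relevance evaluator

using var meterProvider = Sdk.CreateMeterProviderBuilder()
    .AddMeter(BatchEval<object>.MeterId)   // "Microsoft.SKEval"
    .AddLLMEvalMetrics(scoreEvaluators)    // 1-5 histogram buckets per evaluator score
    .AddConsoleExporter()                  // assumption: replace with the exporter you use
    .Build();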
186 changes: 186 additions & 0 deletions src/Microsoft.SKEval/BatchEval.cs
@@ -0,0 +1,186 @@
using Microsoft.SKEval.Metrics;
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.Logging;
using Microsoft.SemanticKernel;
using OpenTelemetry;
using OpenTelemetry.Metrics;
using OpenTelemetry.Trace;
using System.Diagnostics.Metrics;
using System.Text;

namespace Microsoft.SKEval;

public class BatchEval<T>
{
public const string MeterId = "Microsoft.SKEval";

IList<IEvaluator<int>> intEvaluators = new List<IEvaluator<int>>();

IList<IEvaluator<bool>> boolEvaluators = new List<IEvaluator<bool>>();

string? fileName;

IInputProcessor<T>? inputProcessor;

IOutputProcessor? outputProcessor;

public string? OtlpEndpoint { get; set; } = default!;

private ILogger? logger;

public BatchEval(ILogger? logger = null)
{
this.logger = logger;
}

public BatchEval<T> WithInputProcessor(IInputProcessor<T> inputProcessor)
{
this.inputProcessor = inputProcessor;
return this;
}

public BatchEval<T> WithOutputProcessor(IOutputProcessor outputProcessor)
{
this.outputProcessor = outputProcessor;
return this;
}

public BatchEval<T> WithCsvOutputProcessor()
{
return WithOutputProcessor(new CsvOutputProcessor());
}

public BatchEval<T> AddEvaluator(IEvaluator<int> evaluator)
{
intEvaluators.Add(evaluator);
return this;
}

public BatchEval<T> AddEvaluator(IEvaluator<bool> evaluator)
{
boolEvaluators.Add(evaluator);
return this;
}

public async Task<BatchEvalResults> Run()
{
return await ProcessUserInputFile();
}

public BatchEval<T> WithJsonl(string fileName)
{
this.fileName = fileName;
return this;
}

private async Task<BatchEvalResults> ProcessUserInputFile()
{
var meter = new Meter(MeterId);

const int BufferSize = 128;
using (var fileStream = File.OpenRead(fileName!))
using (var streamReader = new StreamReader(fileStream, Encoding.UTF8, true, BufferSize))
{
var results = await ProcessFileLines(streamReader, meter);
return results;
}
}

private EvalMetrics InitCounters(Meter meter)
{
var evalMetrics = new EvalMetrics() {
PromptCounter = meter.CreateCounter<int>($"llmeval.prompt.counter")
};

foreach (var evaluator in intEvaluators)
{
            var histogram = meter.CreateHistogram<int>($"llmeval.{evaluator.Id.ToLowerInvariant()}.score");
            // Key by the lower-cased id so the lookup in ProcessFileLines (which lower-cases the id) succeeds.
            evalMetrics.ScoreHistograms.Add(evaluator.Id.ToLowerInvariant(), histogram);
}

foreach (var evaluator in boolEvaluators)
{
            // Dictionary keys must match the lookups in ProcessFileLines, which do not use the "llmeval." prefix.
            evalMetrics.BooleanCounters.Add(
                $"{evaluator.Id.ToLowerInvariant()}.failure",
                meter.CreateCounter<int>($"{evaluator.Id.ToLowerInvariant()}.failure"));

            evalMetrics.BooleanCounters.Add(
                $"{evaluator.Id.ToLowerInvariant()}.success",
                meter.CreateCounter<int>($"{evaluator.Id.ToLowerInvariant()}.success"));
}

return evalMetrics;
}

private async Task<BatchEvalResults> ProcessFileLines(
StreamReader streamReader,
Meter meter)
{
var evalMetrics = InitCounters(meter);

outputProcessor?.Init();

var results = new BatchEvalResults();

string? line;
while ((line = await streamReader.ReadLineAsync()) != null)
{
var userInput = System.Text.Json.JsonSerializer.Deserialize<T>(line);

var modelOutput = await inputProcessor!.Process(userInput!);

var evalOutput = new BatchEvalPromptOutput()
{
Subject = modelOutput
};

logger?.LogDebug($"QUESTION: {modelOutput.Input}");
logger?.LogDebug($"ANSWER: {modelOutput.Output}");

evalMetrics.PromptCounter.Add(1);

foreach (var evaluator in intEvaluators)
{
var score = await evaluator.Eval(modelOutput);

logger?.LogDebug($"EVAL: {evaluator.Id.ToLowerInvariant()} SCORE: {score}");

evalMetrics.ScoreHistograms[evaluator.Id.ToLowerInvariant()].Record(score);
evalOutput.Results.Add(evaluator.Id.ToLowerInvariant(), score);
}

foreach (var evaluator in boolEvaluators)
{
var evalResult = await evaluator.Eval(modelOutput);

logger?.LogDebug($"EVAL: {evaluator.Id.ToLowerInvariant()} RESULT: {evalResult}");

evalOutput.Results.Add(evaluator.Id.ToLowerInvariant(), evalResult);

if (evalResult) {
evalMetrics.BooleanCounters[$"{evaluator.Id.ToLowerInvariant()}.success"].Add(1);
} else {
evalMetrics.BooleanCounters[$"{evaluator.Id.ToLowerInvariant()}.failure"].Add(1);
}
}

outputProcessor?.Process(evalOutput);

results.EvalResults.Add(evalOutput);
}

return results;
}

public MeterProviderBuilder CreateMeterProviderBuilder()
{
var builder = Sdk.CreateMeterProviderBuilder()
.AddMeter(MeterId);

builder.AddLLMEvalMetrics(intEvaluators);

builder.AddMeter("Microsoft.SemanticKernel*");

return builder;
}
}
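
Putting the pieces together, a hedged usage sketch of the fluent API above: the input type, input processor, evaluator, and kernel variable are illustrative names assumed for this example; only the BatchEval<T> members are taken from this file.

// Illustrative only: ChatInput, ChatStateInputProcessor, RelevanceEval, and `kernel`
// are assumed names, not types defined in this PR. Run from an async test method.
var batchEval = new BatchEval<ChatInput>()
    .WithInputProcessor(new ChatStateInputProcessor(kernel)) // maps each JSONL record to a ModelOutput
    .WithJsonl("assets/chatstate-questions.jsonl")            // one JSON object per line
    .AddEvaluator(new RelevanceEval(kernel))                  // IEvaluator<int>, scores 1-5
    .WithCsvOutputProcessor();                                // write per-prompt results to CSV

using var meterProvider = batchEval.CreateMeterProviderBuilder()
    .AddConsoleExporter()                                     // assumption: any OTel exporter
    .Build();

// Run() reads the JSONL file, scores every line with each registered evaluator,
// records the metrics, and returns the per-prompt results.
BatchEvalResults results = await batchEval.Run();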
8 changes: 8 additions & 0 deletions src/Microsoft.SKEval/BatchEvalPromptOutput.cs
@@ -0,0 +1,8 @@
namespace Microsoft.SKEval;

public class BatchEvalPromptOutput
{
public ModelOutput Subject { get; set; } = default!;

public IDictionary<string, object> Results { get; set; } = new Dictionary<string, object>();
}