Skip to content

Commit

Permalink
ElevenLabs-DotNet 3.4.0 (#69)
Browse files Browse the repository at this point in the history
- Added additional request properties for TextToSpeechRequest
  - `previous_text`, `next_text`, `previous_request_ids`, `next_request_ids`, `languageCode`, `withTimestamps`
- Added support for transcription timestamps
- Added support for language code in TextToSpeechRequest

---------

Co-authored-by: Milan Mikuš <[email protected]>
Co-authored-by: Milan Mikuš <[email protected]>
Co-authored-by: Tom Kail <[email protected]>
Co-authored-by: Tom Kail <[email protected]>
  • Loading branch information
5 people authored Nov 25, 2024
1 parent 41570cb commit ae0b152
Show file tree
Hide file tree
Showing 12 changed files with 355 additions and 81 deletions.
1 change: 1 addition & 0 deletions ElevenLabs-DotNet-Tests/ElevenLabs-DotNet-Tests.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
<PackageReference Include="NUnit" Version="3.13.3" />
<PackageReference Include="NUnit3TestAdapter" Version="4.4.2" />
<PackageReference Include="coverlet.collector" Version="1.0.1" />
<PackageReference Include="System.Text.Json" Version="8.0.5" />
<ProjectReference Include="..\ElevenLabs-DotNet\ElevenLabs-DotNet.csproj" />
<ProjectReference Include="..\ElevenLabs-DotNet-Tests-Proxy\ElevenLabs-DotNet-Tests-Proxy.csproj" />
</ItemGroup>
Expand Down
77 changes: 72 additions & 5 deletions ElevenLabs-DotNet-Tests/TestFixture_04_TextToSpeechEndpoint.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
// Licensed under the MIT License. See LICENSE in the project root for license information.

using ElevenLabs.TextToSpeech;
using NUnit.Framework;
using System;
using System.Collections.Generic;
Expand All @@ -16,8 +17,8 @@ public async Task Test_01_TextToSpeech()
Assert.NotNull(ElevenLabsClient.TextToSpeechEndpoint);
var voice = Voices.Voice.Adam;
Assert.NotNull(voice);
var defaultVoiceSettings = await ElevenLabsClient.VoicesEndpoint.GetDefaultVoiceSettingsAsync();
var voiceClip = await ElevenLabsClient.TextToSpeechEndpoint.TextToSpeechAsync("The quick brown fox jumps over the lazy dog.", voice, defaultVoiceSettings);
var request = new TextToSpeechRequest(voice, "The quick brown fox jumps over the lazy dog.");
var voiceClip = await ElevenLabsClient.TextToSpeechEndpoint.TextToSpeechAsync(request);
Assert.NotNull(voiceClip);
Console.WriteLine(voiceClip.Id);
}
Expand All @@ -28,20 +29,86 @@ public async Task Test_02_StreamTextToSpeech()
Assert.NotNull(ElevenLabsClient.TextToSpeechEndpoint);
var voice = (await ElevenLabsClient.VoicesEndpoint.GetAllVoicesAsync()).FirstOrDefault();
Assert.NotNull(voice);
var defaultVoiceSettings = await ElevenLabsClient.VoicesEndpoint.GetDefaultVoiceSettingsAsync();
var partialClips = new Queue<VoiceClip>();
var voiceClip = await ElevenLabsClient.TextToSpeechEndpoint.TextToSpeechAsync("The quick brown fox jumps over the lazy dog.", voice, defaultVoiceSettings,
partialClipCallback: async partialClip =>
var request = new TextToSpeechRequest(voice, "The quick brown fox jumps over the lazy dog.", outputFormat: OutputFormat.PCM_24000);
var voiceClip = await ElevenLabsClient.TextToSpeechEndpoint.TextToSpeechAsync(request, async partialClip =>
{
Assert.IsNotNull(partialClip);
partialClips.Enqueue(partialClip);
await Task.CompletedTask;
});
Assert.NotNull(partialClips);
Assert.IsNotEmpty(partialClips);
Assert.NotNull(voiceClip);
Console.WriteLine(voiceClip.Id);
}

[Test]
public async Task Test_03_TextToSpeech_Transcription()
{
    // Non-streaming request with timestamps enabled; the returned clip is
    // expected to carry a non-empty per-character timing array.
    Assert.NotNull(ElevenLabsClient.TextToSpeechEndpoint);
    var adam = Voices.Voice.Adam;
    Assert.NotNull(adam);

    var timestampedRequest = new TextToSpeechRequest(adam, "The quick brown fox jumps over the lazy dog.", withTimestamps: true);
    var clip = await ElevenLabsClient.TextToSpeechEndpoint.TextToSpeechAsync(timestampedRequest);
    Assert.NotNull(clip);
    Console.WriteLine(clip.Id);

    var timings = clip.TimestampedTranscriptCharacters;
    Assert.NotNull(timings);
    Assert.IsNotEmpty(timings);

    // Dump the timings as a markdown-style table for manual inspection.
    Console.WriteLine("| Character | Start Time | End Time |");
    Console.WriteLine("| --------- | ---------- | -------- |");
    for (var index = 0; index < timings.Length; index++)
    {
        var entry = timings[index];
        Console.WriteLine($"| {entry.Character} | {entry.StartTime} | {entry.EndTime} |");
    }
}

[Test]
public async Task Test_05_LanguageEnforced_TextToSpeech()
{
    // NOTE(review): despite the name, this test sets no language code; it exercises the
    // partial-clip callback together with withTimestamps. Name kept so existing test
    // filters/reports keep working — consider renaming in a follow-up.
    Assert.NotNull(ElevenLabsClient.TextToSpeechEndpoint);
    var voice = Voices.Voice.Adam;
    Assert.NotNull(voice);
    var partialClips = new Queue<VoiceClip>();
    var characters = new Queue<TimestampedTranscriptCharacter>();
    Console.WriteLine("| Character | Start Time | End Time |");
    Console.WriteLine("| --------- | ---------- | -------- |");
    var request = new TextToSpeechRequest(voice, "The quick brown fox jumps over the lazy dog.", outputFormat: OutputFormat.PCM_24000, withTimestamps: true);
    var voiceClip = await ElevenLabsClient.TextToSpeechEndpoint.TextToSpeechAsync(request, async partialClip =>
    {
        // Same guard as Test_02: a null partial clip is a failure, not something to enqueue.
        Assert.IsNotNull(partialClip);
        partialClips.Enqueue(partialClip);

        // Tolerate a partial clip that carries no timestamp array instead of
        // crashing the callback; the non-empty assertion below still catches
        // the case where no timings arrive at all.
        foreach (var character in partialClip.TimestampedTranscriptCharacters ?? Array.Empty<TimestampedTranscriptCharacter>())
        {
            characters.Enqueue(character);
            Console.WriteLine($"| {character.Character} | {character.StartTime} | {character.EndTime} |");
        }

        await Task.CompletedTask;
    });
    Assert.IsNotEmpty(partialClips);
    // Replaces a duplicated NotNull(partialClips): without this, the final equality
    // check could pass vacuously with two empty collections.
    Assert.IsNotEmpty(characters);
    Assert.NotNull(voiceClip);
    Console.WriteLine(voiceClip.Id);
    // The characters gathered from the partial clips must match the final clip's transcript.
    Assert.AreEqual(characters.ToArray(), voiceClip.TimestampedTranscriptCharacters);
}

[Test]
public async Task Test_TurboV2_5_LanguageEnforced_TextToSpeech()
{
    // Generates speech with the TurboV2_5 model while passing an explicit language code.
    Assert.NotNull(ElevenLabsClient.TextToSpeechEndpoint);
    var adam = Voices.Voice.Adam;
    Assert.NotNull(adam);

    var settings = await ElevenLabsClient.VoicesEndpoint.GetDefaultVoiceSettingsAsync();

    // "cs" presumably selects Czech, matching the sample sentence — confirm against the API docs.
    var languageRequest = new TextToSpeechRequest(
        voice: adam,
        text: "Příliš žluťoučký kůň úpěl ďábelské ódy",
        voiceSettings: settings,
        model: Models.Model.TurboV2_5,
        outputFormat: OutputFormat.MP3_44100_192,
        languageCode: "cs");

    var clip = await ElevenLabsClient.TextToSpeechEndpoint.TextToSpeechAsync(languageRequest);
    Assert.NotNull(clip);
    Console.WriteLine(clip.Id);
}
}
}
5 changes: 4 additions & 1 deletion ElevenLabs-DotNet/Common/GeneratedClip.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,13 @@ namespace ElevenLabs
{
public class GeneratedClip
{
internal GeneratedClip(string id, string text, ReadOnlyMemory<byte> clipData)
internal GeneratedClip(string id, string text, ReadOnlyMemory<byte> clipData, int sampleRate = 44100)
{
Id = id;
Text = text;
TextHash = $"{id}{text}".GenerateGuid().ToString();
ClipData = clipData;
SampleRate = sampleRate;
}

/// <summary>
Expand All @@ -34,5 +35,7 @@ internal GeneratedClip(string id, string text, ReadOnlyMemory<byte> clipData)
/// The raw clip data.
/// </summary>
public ReadOnlyMemory<byte> ClipData { get; }

public int SampleRate { get; }
}
}
42 changes: 42 additions & 0 deletions ElevenLabs-DotNet/Common/TimestampedTranscriptCharacter.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
// Licensed under the MIT License. See LICENSE in the project root for license information.

using System.Text.Json.Serialization;

namespace ElevenLabs
{
/// <summary>
/// A single transcript character together with the time window in which it is spoken.
/// </summary>
public class TimestampedTranscriptCharacter
{
    /// <summary>
    /// Creates an instance whose properties all hold their default values.
    /// </summary>
    public TimestampedTranscriptCharacter()
    {
    }

    /// <summary>
    /// Creates a fully populated instance.
    /// </summary>
    /// <param name="character">The character being spoken.</param>
    /// <param name="startTime">Time in seconds at which the character starts.</param>
    /// <param name="endTime">Time in seconds at which the character ends.</param>
    internal TimestampedTranscriptCharacter(string character, double startTime, double endTime)
        => (Character, StartTime, EndTime) = (character, startTime, endTime);

    /// <summary>
    /// The spoken character.
    /// </summary>
    [JsonInclude, JsonPropertyName("character")]
    public string Character { get; private set; }

    /// <summary>
    /// Start of the character's utterance, in seconds.
    /// </summary>
    [JsonInclude, JsonPropertyName("character_start_times_seconds")]
    public double StartTime { get; private set; }

    /// <summary>
    /// End of the character's utterance, in seconds.
    /// </summary>
    [JsonInclude, JsonPropertyName("character_end_times_seconds")]
    public double EndTime { get; private set; }
}
}
5 changes: 4 additions & 1 deletion ElevenLabs-DotNet/Common/VoiceClip.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,14 @@ namespace ElevenLabs
{
public sealed class VoiceClip : GeneratedClip
{
internal VoiceClip(string id, string text, Voice voice, ReadOnlyMemory<byte> clipData) : base(id, text, clipData)
internal VoiceClip(string id, string text, Voice voice, ReadOnlyMemory<byte> clipData, int sampleRate = 44100)
: base(id, text, clipData, sampleRate)
{
Voice = voice;
}

public Voice Voice { get; }

public TimestampedTranscriptCharacter[] TimestampedTranscriptCharacters { get; internal init; }
}
}
7 changes: 6 additions & 1 deletion ElevenLabs-DotNet/ElevenLabs-DotNet.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,13 @@ All copyrights, trademarks, logos, and assets are the property of their respecti
<SignAssembly>false</SignAssembly>
<IncludeSymbols>true</IncludeSymbols>
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
<Version>3.1.0</Version>
<Version>3.4.0</Version>
<PackageReleaseNotes>
Version 3.4.0
- Added additional request properties for TextToSpeechRequest
- previous_text, next_text, previous_request_ids, next_request_ids, languageCode, withTimestamps
- Added support for transcription timestamps in TextToSpeechResponse
- Added support for language code in TextToSpeechRequest
Version 3.1.0
- Refactored TextToSpeechEndpoint endpoint to accept TextToSpeechRequest object
- Added text encoding options to TextToSpeechRequest
Expand Down
16 changes: 16 additions & 0 deletions ElevenLabs-DotNet/Extensions/Extensions.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
// Licensed under the MIT License. See LICENSE in the project root for license information.

namespace ElevenLabs.Extensions
{
/// <summary>
/// Extension helpers for <see cref="OutputFormat"/>.
/// </summary>
public static class Extensions
{
    /// <summary>
    /// Maps an output format to its sample rate.
    /// </summary>
    /// <param name="format">The output format to inspect.</param>
    /// <returns>
    /// 16000, 22050 or 24000 for the matching PCM formats; 44100 for
    /// <c>PCM_44100</c> and for every other format.
    /// </returns>
    public static int GetSampleRate(this OutputFormat format)
    {
        switch (format)
        {
            case OutputFormat.PCM_16000:
                return 16000;
            case OutputFormat.PCM_22050:
                return 22050;
            case OutputFormat.PCM_24000:
                return 24000;
            case OutputFormat.PCM_44100:
            default:
                // Anything not listed above falls back to 44100.
                return 44100;
        }
    }
}
}
37 changes: 37 additions & 0 deletions ElevenLabs-DotNet/TextToSpeech/Alignment.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
// Licensed under the MIT License. See LICENSE in the project root for license information.

using System.Text.Json.Serialization;

namespace ElevenLabs.TextToSpeech
{
/// <summary>
/// JSON model holding character timing data as three index-aligned arrays:
/// the characters, their start times and their end times.
/// </summary>
internal sealed class Alignment
{
    /// <summary>
    /// The characters, one entry per timed character.
    /// </summary>
    [JsonInclude]
    [JsonPropertyName("characters")]
    public string[] Characters { get; private set; }

    /// <summary>
    /// Start time in seconds for the character at the same index in <see cref="Characters"/>.
    /// </summary>
    [JsonInclude]
    [JsonPropertyName("character_start_times_seconds")]
    public double[] StartTimes { get; private set; }

    /// <summary>
    /// End time in seconds for the character at the same index in <see cref="Characters"/>.
    /// </summary>
    [JsonInclude]
    [JsonPropertyName("character_end_times_seconds")]
    public double[] EndTimes { get; private set; }

    /// <summary>
    /// Zips the three arrays into one <see cref="TimestampedTranscriptCharacter"/> per index.
    /// </summary>
    /// <param name="alignment">The alignment to convert; may be null.</param>
    /// <returns>
    /// Null when <paramref name="alignment"/> is null; an empty array when any of the
    /// three arrays is missing; otherwise one entry per index, limited to the length
    /// of the shortest array.
    /// </returns>
    public static implicit operator TimestampedTranscriptCharacter[](Alignment alignment)
    {
        if (alignment == null) { return null; }

        var characters = alignment.Characters;
        var startTimes = alignment.StartTimes;
        var endTimes = alignment.EndTimes;

        // A payload that omits one of the arrays cannot be zipped; report "no timings"
        // rather than failing with a NullReferenceException inside a conversion operator.
        if (characters == null || startTimes == null || endTimes == null)
        {
            return System.Array.Empty<TimestampedTranscriptCharacter>();
        }

        // Bound the loop by the shortest array so mismatched lengths cannot
        // index out of range.
        var count = System.Math.Min(characters.Length, System.Math.Min(startTimes.Length, endTimes.Length));
        var timestampedTranscriptCharacters = new TimestampedTranscriptCharacter[count];

        for (var i = 0; i < count; i++)
        {
            timestampedTranscriptCharacters[i] = new TimestampedTranscriptCharacter(characters[i], startTimes[i], endTimes[i]);
        }

        return timestampedTranscriptCharacters;
    }
}
}
Loading

0 comments on commit ae0b152

Please sign in to comment.