RageAgainstThePixel · StephenHodgson · Nov 25, 2024 · Oct 27, 2024 · Nov 25, 2024 · Nov 25, 2024
@@ -13,6 +13,7 @@
         <PackageReference Include="NUnit" Version="3.13.3" />
         <PackageReference Include="NUnit3TestAdapter" Version="4.4.2" />
         <PackageReference Include="coverlet.collector" Version="1.0.1" />
+        <PackageReference Include="System.Text.Json" Version="8.0.5" />
         <ProjectReference Include="..\ElevenLabs-DotNet\ElevenLabs-DotNet.csproj" />
         <ProjectReference Include="..\ElevenLabs-DotNet-Tests-Proxy\ElevenLabs-DotNet-Tests-Proxy.csproj" />
     </ItemGroup>

@@ -1,5 +1,6 @@
 // Licensed under the MIT License. See LICENSE in the project root for license information.
 
+using ElevenLabs.TextToSpeech;
 using NUnit.Framework;
 using System;
 using System.Collections.Generic;
@@ -16,8 +17,8 @@ public async Task Test_01_TextToSpeech()
             Assert.NotNull(ElevenLabsClient.TextToSpeechEndpoint);
             var voice = Voices.Voice.Adam;
             Assert.NotNull(voice);
-            var defaultVoiceSettings = await ElevenLabsClient.VoicesEndpoint.GetDefaultVoiceSettingsAsync();
-            var voiceClip = await ElevenLabsClient.TextToSpeechEndpoint.TextToSpeechAsync("The quick brown fox jumps over the lazy dog.", voice, defaultVoiceSettings);
+            var request = new TextToSpeechRequest(voice, "The quick brown fox jumps over the lazy dog.");
+            var voiceClip = await ElevenLabsClient.TextToSpeechEndpoint.TextToSpeechAsync(request);
             Assert.NotNull(voiceClip);
             Console.WriteLine(voiceClip.Id);
         }
@@ -28,20 +29,93 @@ public async Task Test_02_StreamTextToSpeech()
             Assert.NotNull(ElevenLabsClient.TextToSpeechEndpoint);
             var voice = (await ElevenLabsClient.VoicesEndpoint.GetAllVoicesAsync()).FirstOrDefault();
             Assert.NotNull(voice);
-            var defaultVoiceSettings = await ElevenLabsClient.VoicesEndpoint.GetDefaultVoiceSettingsAsync();
             var partialClips = new Queue<VoiceClip>();
-            var voiceClip = await ElevenLabsClient.TextToSpeechEndpoint.TextToSpeechAsync("The quick brown fox jumps over the lazy dog.", voice, defaultVoiceSettings,
-            partialClipCallback: async partialClip =>
+            var request = new TextToSpeechRequest(voice, "The quick brown fox jumps over the lazy dog.", outputFormat: OutputFormat.PCM_24000);
+            var voiceClip = await ElevenLabsClient.TextToSpeechEndpoint.TextToSpeechAsync(request, async partialClip =>
             {
                 Assert.IsNotNull(partialClip);
                 partialClips.Enqueue(partialClip);
                 await Task.CompletedTask;
             });
+            Assert.NotNull(partialClips);
+            Assert.IsNotEmpty(partialClips);
+            Assert.NotNull(voiceClip);
+            Console.WriteLine(voiceClip.Id);
+        }
 
+        /// <summary>Requests speech with timestamps enabled and prints the returned per-character timing table.</summary>
+        [Test]
+        public async Task Test_03_TextToSpeech_Transcription()
+        {
+            Assert.NotNull(ElevenLabsClient.TextToSpeechEndpoint);
+            var voice = Voices.Voice.Adam;
+            Assert.NotNull(voice);
+            var request = new TextToSpeechRequest(voice, "The quick brown fox jumps over the lazy dog.", withTimestamps: true);
+            var voiceClip = await ElevenLabsClient.TextToSpeechEndpoint.TextToSpeechAsync(request);
+            Assert.NotNull(voiceClip);
+            Console.WriteLine(voiceClip.Id);
+            Assert.NotNull(voiceClip.TimestampedTranscriptCharacters);
+            Assert.IsNotEmpty(voiceClip.TimestampedTranscriptCharacters);
+            Console.WriteLine("| Character | Start Time | End Time |");
+            Console.WriteLine("| --------- | ---------- | -------- |");
+            foreach (var character in voiceClip.TimestampedTranscriptCharacters)
+            {
+                Console.WriteLine($"| {character.Character} | {character.StartTime} | {character.EndTime} |");
+            }
+        }
+
+        /// <summary>Collects partial clips and their timestamped characters through the callback, then compares them with the final clip.</summary>
+        // NOTE(review): despite its name, this test passes no language code; it exercises the partial-clip callback with timestamps.
+        [Test]
+        public async Task Test_05_LanguageEnforced_TextToSpeech()
+        {
+            Assert.NotNull(ElevenLabsClient.TextToSpeechEndpoint);
+            var voice = Voices.Voice.Adam;
+            Assert.NotNull(voice);
+            var partialClips = new Queue<VoiceClip>();
+            var characters = new Queue<TimestampedTranscriptCharacter>();
+            Console.WriteLine("| Character | Start Time | End Time |");
+            Console.WriteLine("| --------- | ---------- | -------- |");
+            var request = new TextToSpeechRequest(voice, "The quick brown fox jumps over the lazy dog.", outputFormat: OutputFormat.PCM_24000, withTimestamps: true);
+            var voiceClip = await ElevenLabsClient.TextToSpeechEndpoint.TextToSpeechAsync(request, async partialClip =>
+            {
+                Assert.IsNotNull(partialClip);
+                Assert.IsNotNull(partialClip.TimestampedTranscriptCharacters);
+                partialClips.Enqueue(partialClip);
+                foreach (var character in partialClip.TimestampedTranscriptCharacters)
+                {
+                    characters.Enqueue(character);
+                    Console.WriteLine($"| {character.Character} | {character.StartTime} | {character.EndTime} |");
+                }
+                await Task.CompletedTask;
+            });
             Assert.NotNull(partialClips);
             Assert.IsNotEmpty(partialClips);
             Assert.NotNull(voiceClip);
             Console.WriteLine(voiceClip.Id);
+            Assert.IsNotEmpty(characters);
+            // NOTE(review): TimestampedTranscriptCharacter does not override Equals, so elements are compared by reference - confirm the final clip reuses the streamed instances.
+            Assert.AreEqual(characters.ToArray(), voiceClip.TimestampedTranscriptCharacters);
+        }
+
+        /// <summary>Requests speech with Models.Model.TurboV2_5 and languageCode "cs", then checks that a clip is returned.</summary>
+        [Test]
+        public async Task Test_TurboV2_5_LanguageEnforced_TextToSpeech()
+        {
+            Assert.NotNull(ElevenLabsClient.TextToSpeechEndpoint);
+            var voice = Voices.Voice.Adam;
+            Assert.NotNull(voice);
+            var defaultVoiceSettings = await ElevenLabsClient.VoicesEndpoint.GetDefaultVoiceSettingsAsync();
+            var request = new TextToSpeechRequest(
+                voice: voice,
+                text: "Příliš žluťoučký kůň úpěl ďábelské ódy",
+                voiceSettings: defaultVoiceSettings,
+                model: Models.Model.TurboV2_5,
+                outputFormat: OutputFormat.MP3_44100_192,
+                languageCode: "cs");
+            var voiceClip = await ElevenLabsClient.TextToSpeechEndpoint.TextToSpeechAsync(request);
+            Assert.NotNull(voiceClip);
+            Console.WriteLine(voiceClip.Id);
         }
     }
 }
@@ -7,12 +7,13 @@ namespace ElevenLabs
 {
     public class GeneratedClip
     {
-        internal GeneratedClip(string id, string text, ReadOnlyMemory<byte> clipData)
+        internal GeneratedClip(string id, string text, ReadOnlyMemory<byte> clipData, int sampleRate = 44100)
         {
             Id = id;
             Text = text;
             TextHash = $"{id}{text}".GenerateGuid().ToString();
             ClipData = clipData;
+            SampleRate = sampleRate;
         }
 
         /// <summary>
@@ -34,5 +35,7 @@ internal GeneratedClip(string id, string text, ReadOnlyMemory<byte> clipData)
         /// The raw clip data.
         /// </summary>
         public ReadOnlyMemory<byte> ClipData { get; }
+
+        public int SampleRate { get; }
     }
 }
@@ -0,0 +1,44 @@
+// Licensed under the MIT License. See LICENSE in the project root for license information.
+
+using System.Text.Json.Serialization;
+
+namespace ElevenLabs
+{
+    /// <summary>
+    /// Timing information for one character of a generated transcript.
+    /// </summary>
+    public class TimestampedTranscriptCharacter
+    {
+        /// <summary>
+        /// Creates an instance whose properties are left at their default values.
+        /// </summary>
+        public TimestampedTranscriptCharacter() { }
+
+        /// <summary>
+        /// Creates an instance with every value supplied up front.
+        /// </summary>
+        /// <param name="character">The character being spoken.</param>
+        /// <param name="startTime">Time, in seconds, at which the character starts being spoken.</param>
+        /// <param name="endTime">Time, in seconds, at which the character finishes being spoken.</param>
+        internal TimestampedTranscriptCharacter(string character, double startTime, double endTime)
+            => (Character, StartTime, EndTime) = (character, startTime, endTime);
+
+        /// <summary>
+        /// The spoken character.
+        /// </summary>
+        [JsonInclude, JsonPropertyName("character")]
+        public string Character { get; private set; }
+
+        /// <summary>
+        /// Moment, in seconds, at which the character begins to be spoken.
+        /// </summary>
+        [JsonInclude, JsonPropertyName("character_start_times_seconds")]
+        public double StartTime { get; private set; }
+
+        /// <summary>
+        /// Moment, in seconds, at which the character has finished being spoken.
+        /// </summary>
+        [JsonInclude, JsonPropertyName("character_end_times_seconds")]
+        public double EndTime { get; private set; }
+    }
+}
@@ -7,11 +7,14 @@ namespace ElevenLabs
 {
     public sealed class VoiceClip : GeneratedClip
     {
-        internal VoiceClip(string id, string text, Voice voice, ReadOnlyMemory<byte> clipData) : base(id, text, clipData)
+        internal VoiceClip(string id, string text, Voice voice, ReadOnlyMemory<byte> clipData, int sampleRate = 44100)
+            : base(id, text, clipData, sampleRate)
         {
             Voice = voice;
         }
 
         public Voice Voice { get; }
+
+        public TimestampedTranscriptCharacter[] TimestampedTranscriptCharacters { get; internal init; }
     }
 }
@@ -25,8 +25,13 @@ All copyrights, trademarks, logos, and assets are the property of their respecti
     <SignAssembly>false</SignAssembly>
     <IncludeSymbols>true</IncludeSymbols>
     <TreatWarningsAsErrors>true</TreatWarningsAsErrors>
-    <Version>3.1.0</Version>
+    <Version>3.4.0</Version>
     <PackageReleaseNotes>
+Version 3.4.0
+- Added additional request properties for TextToSpeechRequest
+  - previous_text, next_text, previous_request_ids, next_request_ids, languageCode, withTimestamps
+- Added support for transcription timestamps in TextToSpeechResponse
+- Added support for language code in TextToSpeechRequest
 Version 3.1.0
 - Refactored TextToSpeechEndpoint endpoint to accept TextToSpeechRequest object
   - Added text encoding options to TextToSpeechRequest

@@ -0,0 +1,35 @@
+// Licensed under the MIT License. See LICENSE in the project root for license information.
+
+namespace ElevenLabs.Extensions
+{
+    /// <summary>
+    /// Extension helpers for <see cref="OutputFormat"/>.
+    /// </summary>
+    public static class Extensions
+    {
+        /// <summary>
+        /// Maps an <see cref="OutputFormat"/> to a sample rate.
+        /// </summary>
+        /// <param name="format">The output format to look up.</param>
+        /// <returns>
+        /// 16000, 22050 or 24000 for the matching PCM formats; 44100 for
+        /// <see cref="OutputFormat.PCM_44100"/> and for every other value.
+        /// </returns>
+        public static int GetSampleRate(this OutputFormat format)
+        {
+            switch (format)
+            {
+                case OutputFormat.PCM_16000:
+                    return 16000;
+                case OutputFormat.PCM_22050:
+                    return 22050;
+                case OutputFormat.PCM_24000:
+                    return 24000;
+                case OutputFormat.PCM_44100:
+                default:
+                    // NOTE(review): formats without an explicit case also report 44100 - confirm that suits every non-PCM format.
+                    return 44100;
+            }
+        }
+    }
+}
@@ -0,0 +1,67 @@
+// Licensed under the MIT License. See LICENSE in the project root for license information.
+
+using System.Text.Json.Serialization;
+
+namespace ElevenLabs.TextToSpeech
+{
+    /// <summary>
+    /// Character alignment payload: three parallel arrays holding the characters
+    /// and their start and end times.
+    /// </summary>
+    internal sealed class Alignment
+    {
+        /// <summary>
+        /// The characters, one entry per element.
+        /// </summary>
+        [JsonInclude]
+        [JsonPropertyName("characters")]
+        public string[] Characters { get; private set; }
+
+        /// <summary>
+        /// Start time of each character, index-aligned with <see cref="Characters"/>.
+        /// </summary>
+        [JsonInclude]
+        [JsonPropertyName("character_start_times_seconds")]
+        public double[] StartTimes { get; private set; }
+
+        /// <summary>
+        /// End time of each character, index-aligned with <see cref="Characters"/>.
+        /// </summary>
+        [JsonInclude]
+        [JsonPropertyName("character_end_times_seconds")]
+        public double[] EndTimes { get; private set; }
+
+        /// <summary>
+        /// Zips the three parallel arrays into one <see cref="TimestampedTranscriptCharacter"/> per character.
+        /// </summary>
+        /// <param name="alignment">The alignment to convert; may be null.</param>
+        /// <returns>
+        /// Null when <paramref name="alignment"/> is null, an empty array when any of its arrays is missing,
+        /// otherwise one entry for each index present in all three arrays.
+        /// </returns>
+        public static implicit operator TimestampedTranscriptCharacter[](Alignment alignment)
+        {
+            if (alignment == null) { return null; }
+            var characters = alignment.Characters;
+            var startTimes = alignment.StartTimes;
+            var endTimes = alignment.EndTimes;
+
+            // An implicit conversion should not throw: a payload with a missing array yields no entries.
+            if (characters == null || startTimes == null || endTimes == null)
+            {
+                return System.Array.Empty<TimestampedTranscriptCharacter>();
+            }
+
+            // Only convert indices that exist in every array, so ragged input cannot overrun a shorter one.
+            var count = System.Math.Min(characters.Length, System.Math.Min(startTimes.Length, endTimes.Length));
+            var timestampedTranscriptCharacters = new TimestampedTranscriptCharacter[count];
+
+            for (var i = 0; i < count; i++)
+            {
+                timestampedTranscriptCharacters[i] = new TimestampedTranscriptCharacter(characters[i], startTimes[i], endTimes[i]);
+            }
+
+            return timestampedTranscriptCharacters;
+        }
+    }
+}