diff --git a/ElevenLabs-DotNet-Tests/ElevenLabs-DotNet-Tests.csproj b/ElevenLabs-DotNet-Tests/ElevenLabs-DotNet-Tests.csproj index d96b7fe..4369072 100644 --- a/ElevenLabs-DotNet-Tests/ElevenLabs-DotNet-Tests.csproj +++ b/ElevenLabs-DotNet-Tests/ElevenLabs-DotNet-Tests.csproj @@ -13,6 +13,7 @@ + diff --git a/ElevenLabs-DotNet-Tests/TestFixture_04_TextToSpeechEndpoint.cs b/ElevenLabs-DotNet-Tests/TestFixture_04_TextToSpeechEndpoint.cs index 545f487..d855b39 100644 --- a/ElevenLabs-DotNet-Tests/TestFixture_04_TextToSpeechEndpoint.cs +++ b/ElevenLabs-DotNet-Tests/TestFixture_04_TextToSpeechEndpoint.cs @@ -1,5 +1,6 @@ // Licensed under the MIT License. See LICENSE in the project root for license information. +using ElevenLabs.TextToSpeech; using NUnit.Framework; using System; using System.Collections.Generic; @@ -16,8 +17,8 @@ public async Task Test_01_TextToSpeech() Assert.NotNull(ElevenLabsClient.TextToSpeechEndpoint); var voice = Voices.Voice.Adam; Assert.NotNull(voice); - var defaultVoiceSettings = await ElevenLabsClient.VoicesEndpoint.GetDefaultVoiceSettingsAsync(); - var voiceClip = await ElevenLabsClient.TextToSpeechEndpoint.TextToSpeechAsync("The quick brown fox jumps over the lazy dog.", voice, defaultVoiceSettings); + var request = new TextToSpeechRequest(voice, "The quick brown fox jumps over the lazy dog."); + var voiceClip = await ElevenLabsClient.TextToSpeechEndpoint.TextToSpeechAsync(request); Assert.NotNull(voiceClip); Console.WriteLine(voiceClip.Id); } @@ -28,20 +29,86 @@ public async Task Test_02_StreamTextToSpeech() Assert.NotNull(ElevenLabsClient.TextToSpeechEndpoint); var voice = (await ElevenLabsClient.VoicesEndpoint.GetAllVoicesAsync()).FirstOrDefault(); Assert.NotNull(voice); - var defaultVoiceSettings = await ElevenLabsClient.VoicesEndpoint.GetDefaultVoiceSettingsAsync(); var partialClips = new Queue(); - var voiceClip = await ElevenLabsClient.TextToSpeechEndpoint.TextToSpeechAsync("The quick brown fox jumps over the lazy dog.", voice, defaultVoiceSettings, - partialClipCallback: async partialClip => + var request = new TextToSpeechRequest(voice, "The quick brown fox jumps over the lazy dog.", outputFormat: OutputFormat.PCM_24000); + var voiceClip = await ElevenLabsClient.TextToSpeechEndpoint.TextToSpeechAsync(request, async partialClip => { Assert.IsNotNull(partialClip); partialClips.Enqueue(partialClip); await Task.CompletedTask; }); + Assert.NotNull(partialClips); + Assert.IsNotEmpty(partialClips); + Assert.NotNull(voiceClip); + Console.WriteLine(voiceClip.Id); + } + [Test] + public async Task Test_03_TextToSpeech_Transcription() + { + Assert.NotNull(ElevenLabsClient.TextToSpeechEndpoint); + var voice = Voices.Voice.Adam; + Assert.NotNull(voice); + var request = new TextToSpeechRequest(voice, "The quick brown fox jumps over the lazy dog.", withTimestamps: true); + var voiceClip = await ElevenLabsClient.TextToSpeechEndpoint.TextToSpeechAsync(request); + Assert.NotNull(voiceClip); + Console.WriteLine(voiceClip.Id); + Assert.NotNull(voiceClip.TimestampedTranscriptCharacters); + Assert.IsNotEmpty(voiceClip.TimestampedTranscriptCharacters); + Console.WriteLine("| Character | Start Time | End Time |"); + Console.WriteLine("| --------- | ---------- | -------- |"); + foreach (var character in voiceClip.TimestampedTranscriptCharacters) + { + Console.WriteLine($"| {character.Character} | {character.StartTime} | {character.EndTime} |"); + } + } + + [Test] + public async Task Test_05_LanguageEnforced_TextToSpeech() + { + Assert.NotNull(ElevenLabsClient.TextToSpeechEndpoint); + var voice = Voices.Voice.Adam; + Assert.NotNull(voice); + var partialClips = new Queue(); + var characters = new Queue(); + Console.WriteLine("| Character | Start Time | End Time |"); + Console.WriteLine("| --------- | ---------- | -------- |"); + var request = new TextToSpeechRequest(voice, "The quick brown fox jumps over the lazy dog.", outputFormat: OutputFormat.PCM_24000, withTimestamps: true); + var voiceClip = await ElevenLabsClient.TextToSpeechEndpoint.TextToSpeechAsync(request, async partialClip => + { + await Task.CompletedTask; + partialClips.Enqueue(partialClip); + foreach (var character in partialClip.TimestampedTranscriptCharacters) + { + characters.Enqueue(character); + Console.WriteLine($"| {character.Character} | {character.StartTime} | {character.EndTime} |"); + } + }); + Assert.NotNull(partialClips); Assert.NotNull(partialClips); Assert.IsNotEmpty(partialClips); Assert.NotNull(voiceClip); Console.WriteLine(voiceClip.Id); + Assert.AreEqual(characters.ToArray(), voiceClip.TimestampedTranscriptCharacters); + } + + [Test] + public async Task Test_TurboV2_5_LanguageEnforced_TextToSpeech() + { + Assert.NotNull(ElevenLabsClient.TextToSpeechEndpoint); + var voice = Voices.Voice.Adam; + Assert.NotNull(voice); + var defaultVoiceSettings = await ElevenLabsClient.VoicesEndpoint.GetDefaultVoiceSettingsAsync(); + var request = new TextToSpeechRequest( + voice: voice, + text: "Příliš žluťoučký kůň úpěl ďábelské ódy", + voiceSettings: defaultVoiceSettings, + model: Models.Model.TurboV2_5, + outputFormat: OutputFormat.MP3_44100_192, + languageCode: "cs"); + var voiceClip = await ElevenLabsClient.TextToSpeechEndpoint.TextToSpeechAsync(request); + Assert.NotNull(voiceClip); + Console.WriteLine(voiceClip.Id); } } } \ No newline at end of file diff --git a/ElevenLabs-DotNet/Common/GeneratedClip.cs b/ElevenLabs-DotNet/Common/GeneratedClip.cs index d450ee9..d681c8d 100644 --- a/ElevenLabs-DotNet/Common/GeneratedClip.cs +++ b/ElevenLabs-DotNet/Common/GeneratedClip.cs @@ -7,12 +7,13 @@ namespace ElevenLabs { public class GeneratedClip { - internal GeneratedClip(string id, string text, ReadOnlyMemory clipData) + internal GeneratedClip(string id, string text, ReadOnlyMemory clipData, int sampleRate = 44100) { Id = id; Text = text; TextHash = $"{id}{text}".GenerateGuid().ToString(); ClipData = clipData; + SampleRate = sampleRate; } /// @@ -34,5 +35,7 @@ internal GeneratedClip(string id, string text, ReadOnlyMemory clipData) /// The ray clip data. /// public ReadOnlyMemory ClipData { get; } + + public int SampleRate { get; } } } \ No newline at end of file diff --git a/ElevenLabs-DotNet/Common/TimestampedTranscriptCharacter.cs b/ElevenLabs-DotNet/Common/TimestampedTranscriptCharacter.cs new file mode 100644 index 0000000..ff4b85a --- /dev/null +++ b/ElevenLabs-DotNet/Common/TimestampedTranscriptCharacter.cs @@ -0,0 +1,42 @@ +// Licensed under the MIT License. See LICENSE in the project root for license information. + +using System.Text.Json.Serialization; + +namespace ElevenLabs +{ + /// + /// Represents timing information for a single character in the transcript + /// + public class TimestampedTranscriptCharacter + { + public TimestampedTranscriptCharacter() { } + + internal TimestampedTranscriptCharacter(string character, double startTime, double endTime) + { + Character = character; + StartTime = startTime; + EndTime = endTime; + } + + /// + /// The character being spoken + /// + [JsonInclude] + [JsonPropertyName("character")] + public string Character { get; private set; } + + /// + /// The time in seconds when this character starts being spoken + /// + [JsonInclude] + [JsonPropertyName("character_start_times_seconds")] + public double StartTime { get; private set; } + + /// + /// The time in seconds when this character finishes being spoken + /// + [JsonInclude] + [JsonPropertyName("character_end_times_seconds")] + public double EndTime { get; private set; } + } +} diff --git a/ElevenLabs-DotNet/Common/VoiceClip.cs b/ElevenLabs-DotNet/Common/VoiceClip.cs index 8f75dbd..72986a1 100644 --- a/ElevenLabs-DotNet/Common/VoiceClip.cs +++ b/ElevenLabs-DotNet/Common/VoiceClip.cs @@ -7,11 +7,14 @@ namespace ElevenLabs { public sealed class VoiceClip : GeneratedClip { - internal VoiceClip(string id, string text, Voice voice, ReadOnlyMemory clipData) : base(id, text, clipData) + internal VoiceClip(string id, string text, Voice voice, ReadOnlyMemory clipData, int sampleRate = 44100) + : base(id, text, clipData, sampleRate) { Voice = voice; } public Voice Voice { get; } + + public TimestampedTranscriptCharacter[] TimestampedTranscriptCharacters { get; internal init; } } } diff --git a/ElevenLabs-DotNet/ElevenLabs-DotNet.csproj b/ElevenLabs-DotNet/ElevenLabs-DotNet.csproj index 08a3542..197d6db 100644 --- a/ElevenLabs-DotNet/ElevenLabs-DotNet.csproj +++ b/ElevenLabs-DotNet/ElevenLabs-DotNet.csproj @@ -25,8 +25,13 @@ All copyrights, trademarks, logos, and assets are the property of their respecti false true true - 3.1.0 + 3.4.0 +Version 3.4.0 +- Added additional request properties for TextToSpeechRequest + - previous_text, next_text, previous_request_ids, next_request_ids, languageCode, withTimestamps +- Added support for transcription timestamps in TextToSpeechResponse +- Added support for language code in TextToSpeechRequest Version 3.1.0 - Refactored TextToSpeechEndpoint endpoint to accept TextToSpeechRequest object - Added text encoding options to TextToSpeechRequest diff --git a/ElevenLabs-DotNet/Extensions/Extensions.cs b/ElevenLabs-DotNet/Extensions/Extensions.cs new file mode 100644 index 0000000..c6bf1a1 --- /dev/null +++ b/ElevenLabs-DotNet/Extensions/Extensions.cs @@ -0,0 +1,16 @@ +// Licensed under the MIT License. See LICENSE in the project root for license information. + +namespace ElevenLabs.Extensions +{ + public static class Extensions + { + public static int GetSampleRate(this OutputFormat format) => format switch + { + OutputFormat.PCM_16000 => 16000, + OutputFormat.PCM_22050 => 22050, + OutputFormat.PCM_24000 => 24000, + OutputFormat.PCM_44100 => 44100, + _ => 44100 + }; + } +} diff --git a/ElevenLabs-DotNet/TextToSpeech/Alignment.cs b/ElevenLabs-DotNet/TextToSpeech/Alignment.cs new file mode 100644 index 0000000..d645415 --- /dev/null +++ b/ElevenLabs-DotNet/TextToSpeech/Alignment.cs @@ -0,0 +1,37 @@ +// Licensed under the MIT License. See LICENSE in the project root for license information. + +using System.Text.Json.Serialization; + +namespace ElevenLabs.TextToSpeech +{ + internal sealed class Alignment + { + [JsonInclude] + [JsonPropertyName("characters")] + public string[] Characters { get; private set; } + + [JsonInclude] + [JsonPropertyName("character_start_times_seconds")] + public double[] StartTimes { get; private set; } + + [JsonInclude] + [JsonPropertyName("character_end_times_seconds")] + public double[] EndTimes { get; private set; } + + public static implicit operator TimestampedTranscriptCharacter[](Alignment alignment) + { + if (alignment == null) { return null; } + var characters = alignment.Characters; + var startTimes = alignment.StartTimes; + var endTimes = alignment.EndTimes; + var timestampedTranscriptCharacters = new TimestampedTranscriptCharacter[characters.Length]; + + for (var i = 0; i < characters.Length; i++) + { + timestampedTranscriptCharacters[i] = new TimestampedTranscriptCharacter(characters[i], startTimes[i], endTimes[i]); + } + + return timestampedTranscriptCharacters; + } + } +} diff --git a/ElevenLabs-DotNet/TextToSpeech/TextToSpeechEndpoint.cs b/ElevenLabs-DotNet/TextToSpeech/TextToSpeechEndpoint.cs index bfe7098..6b387b3 100644 --- a/ElevenLabs-DotNet/TextToSpeech/TextToSpeechEndpoint.cs +++ b/ElevenLabs-DotNet/TextToSpeech/TextToSpeechEndpoint.cs @@ -28,42 +28,7 @@ public TextToSpeechEndpoint(ElevenLabsClient client) : base(client) { } protected override string Root => "text-to-speech"; - /// - /// Converts text into speech using a voice of your choice and returns audio. - /// - /// - /// Text input to synthesize speech for. Maximum 5000 characters. - /// - /// - /// to use. - /// - /// - /// Optional, that will override the default settings in . - /// - /// - /// Optional, to use. Defaults to . - /// - /// - /// Output format of the generated audio.
- /// Defaults to - /// - /// - /// Optional, You can turn on latency optimizations at some cost of quality. - /// The best possible final latency varies by model.
- /// Possible values:
- /// 0 - default mode (no latency optimizations)
- /// 1 - normal latency optimizations (about 50% of possible latency improvement of option 3)
- /// 2 - strong latency optimizations (about 75% of possible latency improvement of option 3)
- /// 3 - max latency optimizations
- /// 4 - max latency optimizations, but also with text normalizer turned off for even more latency savings - /// (best latency, but can mispronounce e.g. numbers and dates). - /// - /// - /// Optional, Callback to enable streaming audio as it comes in.
- /// Returns partial . - /// - /// Optional, . - /// . + [Obsolete("use overload with TextToSpeechRequest")] public async Task TextToSpeechAsync(string text, Voice voice, VoiceSettings voiceSettings = null, Model model = null, OutputFormat outputFormat = OutputFormat.MP3_44100_128, int? optimizeStreamingLatency = null, Func partialClipCallback = null, CancellationToken cancellationToken = default) { var defaultVoiceSettings = voiceSettings ?? voice.Settings ?? await client.VoicesEndpoint.GetDefaultVoiceSettingsAsync(cancellationToken); @@ -82,6 +47,7 @@ public async Task TextToSpeechAsync(string text, Voice voice, VoiceSe /// . public async Task TextToSpeechAsync(TextToSpeechRequest request, Func partialClipCallback = null, CancellationToken cancellationToken = default) { + request.VoiceSettings ??= await client.VoicesEndpoint.GetDefaultVoiceSettingsAsync(cancellationToken); using var payload = JsonSerializer.Serialize(request, ElevenLabsClient.JsonSerializationOptions).ToJsonStringContent(); var parameters = new Dictionary { @@ -93,7 +59,19 @@ public async Task TextToSpeechAsync(TextToSpeechRequest request, Func parameters.Add(OptimizeStreamingLatencyParameter, request.OptimizeStreamingLatency.Value.ToString()); } - using var postRequest = new HttpRequestMessage(HttpMethod.Post, GetUrl($"/{request.Voice.Id}{(partialClipCallback == null ? string.Empty : "/stream")}", parameters)); + var endpoint = $"/{request.Voice.Id}"; + + if (partialClipCallback != null) + { + endpoint += "/stream"; + } + + if (request.WithTimestamps) + { + endpoint += "/with-timestamps"; + } + + using var postRequest = new HttpRequestMessage(HttpMethod.Post, GetUrl(endpoint, parameters)); postRequest.Content = payload; var requestOption = partialClipCallback == null ? HttpCompletionOption.ResponseContentRead @@ -107,32 +85,85 @@ public async Task TextToSpeechAsync(TextToSpeechRequest request, Func throw new ArgumentException("Failed to parse clip id!"); } - await using var responseStream = await response.Content.ReadAsStreamAsync(cancellationToken).ConfigureAwait(false); - await using var memoryStream = new MemoryStream(); - int bytesRead; - var totalBytesRead = 0; - var buffer = new byte[8192]; + return request.WithTimestamps + ? await StreamWithTimeStampsAsync(response).ConfigureAwait(false) + : await StreamAsync(response).ConfigureAwait(false); - while ((bytesRead = await responseStream.ReadAsync(buffer, cancellationToken).ConfigureAwait(false)) > 0) + async Task StreamWithTimeStampsAsync(HttpResponseMessage messageResponse) { - await memoryStream.WriteAsync(new ReadOnlyMemory(buffer, 0, bytesRead), cancellationToken).ConfigureAwait(false); + await using var audioDataStream = new MemoryStream(); + var accumulatedTranscriptData = new List(); + await using var stream = await messageResponse.Content.ReadAsStreamAsync(cancellationToken).ConfigureAwait(false); + using var reader = new StreamReader(stream); - if (partialClipCallback != null) + while (await reader.ReadLineAsync(cancellationToken).ConfigureAwait(false) is { } line) { - try + const string data = "data: "; + const string done = "[DONE]"; + + if (line.StartsWith(data)) { line = line[data.Length..]; } + if (line == done) { break; } + if (string.IsNullOrWhiteSpace(line)) { continue; } + + var transcriptData = JsonSerializer.Deserialize(line, ElevenLabsClient.JsonSerializationOptions); + var timestampedTranscriptCharacters = (TimestampedTranscriptCharacter[])transcriptData.Alignment ?? []; + + if (partialClipCallback != null) { - await partialClipCallback(new VoiceClip(clipId, request.Text, request.Voice, new ReadOnlyMemory(memoryStream.GetBuffer(), totalBytesRead, bytesRead))).ConfigureAwait(false); + try + { + var partialClip = new VoiceClip(clipId, request.Text, request.Voice, new ReadOnlyMemory(transcriptData.AudioBytes), request.OutputFormat.GetSampleRate()) + { + TimestampedTranscriptCharacters = timestampedTranscriptCharacters + }; + await partialClipCallback(partialClip).ConfigureAwait(false); + } + catch (Exception e) + { + Console.WriteLine(e); + } } - catch (Exception e) + + accumulatedTranscriptData.AddRange(timestampedTranscriptCharacters); + await audioDataStream.WriteAsync(transcriptData.AudioBytes, 0, transcriptData.AudioBytes.Length, cancellationToken).ConfigureAwait(false); + } + + return new VoiceClip(clipId, request.Text, request.Voice, new ReadOnlyMemory(audioDataStream.GetBuffer(), 0, (int)audioDataStream.Length), request.OutputFormat.GetSampleRate()) + { + TimestampedTranscriptCharacters = accumulatedTranscriptData.ToArray() + }; + } + + async Task StreamAsync(HttpResponseMessage messageResponse) + { + int bytesRead; + var totalBytesRead = 0; + var buffer = new byte[8192]; + await using var audioDataStream = new MemoryStream(); + await using var responseStream = await messageResponse.Content.ReadAsStreamAsync(cancellationToken).ConfigureAwait(false); + + while ((bytesRead = await responseStream.ReadAsync(buffer, cancellationToken).ConfigureAwait(false)) > 0) + { + await audioDataStream.WriteAsync(new ReadOnlyMemory(buffer, 0, bytesRead), cancellationToken).ConfigureAwait(false); + + if (partialClipCallback != null) { - Console.WriteLine(e); + try + { + var partialClip = new VoiceClip(clipId, request.Text, request.Voice, new ReadOnlyMemory(audioDataStream.GetBuffer(), totalBytesRead, bytesRead), request.OutputFormat.GetSampleRate()); + await partialClipCallback(partialClip).ConfigureAwait(false); + } + catch (Exception e) + { + Console.WriteLine(e); + } } + + totalBytesRead += bytesRead; } - totalBytesRead += bytesRead; + return new VoiceClip(clipId, request.Text, request.Voice, new ReadOnlyMemory(audioDataStream.GetBuffer(), 0, totalBytesRead), request.OutputFormat.GetSampleRate()); } - - return new VoiceClip(clipId, request.Text, request.Voice, new ReadOnlyMemory(memoryStream.GetBuffer(), 0, totalBytesRead)); } } } diff --git a/ElevenLabs-DotNet/TextToSpeech/TextToSpeechRequest.cs b/ElevenLabs-DotNet/TextToSpeech/TextToSpeechRequest.cs index 1ca2a29..ca77782 100644 --- a/ElevenLabs-DotNet/TextToSpeech/TextToSpeechRequest.cs +++ b/ElevenLabs-DotNet/TextToSpeech/TextToSpeechRequest.cs @@ -10,8 +10,9 @@ namespace ElevenLabs.TextToSpeech { public sealed class TextToSpeechRequest { - public TextToSpeechRequest(string text, Model model, VoiceSettings voiceSettings) : - this(null, text, voiceSettings: voiceSettings, model: model) + [Obsolete] + public TextToSpeechRequest(string text, Model model, VoiceSettings voiceSettings) + : this(null, text, voiceSettings: voiceSettings, model: model) { } @@ -29,7 +30,7 @@ public TextToSpeechRequest(string text, Model model, VoiceSettings voiceSettings /// Optional, that will override the default settings in . /// /// - /// Optional, to use. Defaults to . + /// Optional, to use. Defaults to . /// /// /// Output format of the generated audio.
@@ -47,8 +48,14 @@ public TextToSpeechRequest(string text, Model model, VoiceSettings voiceSettings /// (best latency, but can mispronounce e.g. numbers and dates). /// /// - /// - /// + /// + /// + /// + /// + /// Optional, Language code (ISO 639-1) used to enforce a language for the model. Currently only supports language enforcement. + /// For other models, an error will be returned if language code is provided. + /// + /// public TextToSpeechRequest( Voice voice, string text, @@ -57,7 +64,12 @@ public TextToSpeechRequest( OutputFormat outputFormat = OutputFormat.MP3_44100_128, int? optimizeStreamingLatency = null, Model model = null, - string previousText = null) + string previousText = null, + string nextText = null, + string[] previousRequestIds = null, + string[] nextRequestIds = null, + string languageCode = null, + bool withTimestamps = false) { if (string.IsNullOrWhiteSpace(text)) { @@ -81,12 +93,25 @@ public TextToSpeechRequest( } Text = text; - Model = model ?? Models.Model.MultiLingualV2; + Model = model ?? Models.Model.TurboV2_5; Voice = voice; - VoiceSettings = voiceSettings ?? voice.Settings ?? throw new ArgumentNullException(nameof(voiceSettings)); - PreviousText = previousText; + VoiceSettings = voiceSettings ?? voice.Settings; OutputFormat = outputFormat; OptimizeStreamingLatency = optimizeStreamingLatency; + PreviousText = previousText; + NextText = nextText; + if (previousRequestIds?.Length > 3) + { + previousRequestIds = previousRequestIds[..3]; + } + PreviousRequestIds = previousRequestIds; + if (nextRequestIds?.Length > 3) + { + nextRequestIds = nextRequestIds[..3]; + } + NextRequestIds = nextRequestIds; + LanguageCode = languageCode; + WithTimestamps = withTimestamps; } [JsonPropertyName("text")] @@ -99,7 +124,7 @@ public TextToSpeechRequest( public Voice Voice { get; } [JsonPropertyName("voice_settings")] - public VoiceSettings VoiceSettings { get; } + public VoiceSettings VoiceSettings { get; internal set; } [JsonPropertyName("previous_text")] [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingDefault)] @@ -110,5 +135,27 @@ public TextToSpeechRequest( [JsonIgnore] public int? OptimizeStreamingLatency { get; } + + [JsonPropertyName("next_text")] + public string NextText { get; } + + /// + /// A maximum of three next or previous history item ids can be sent + /// + [JsonPropertyName("previous_request_ids")] + public string[] PreviousRequestIds { get; } + + /// + /// A maximum of three next or previous history item ids can be sent + /// + [JsonPropertyName("next_request_ids")] + public string[] NextRequestIds { get; } + + + [JsonPropertyName("language_code")] + public string LanguageCode { get; } + + [JsonIgnore] + public bool WithTimestamps { get; } } } diff --git a/ElevenLabs-DotNet/TextToSpeech/TranscriptionResponse.cs b/ElevenLabs-DotNet/TextToSpeech/TranscriptionResponse.cs new file mode 100644 index 0000000..0f8cc48 --- /dev/null +++ b/ElevenLabs-DotNet/TextToSpeech/TranscriptionResponse.cs @@ -0,0 +1,21 @@ +// Licensed under the MIT License. See LICENSE in the project root for license information. + +using System; +using System.Text.Json.Serialization; + +namespace ElevenLabs.TextToSpeech +{ + internal sealed class TranscriptionResponse + { + [JsonInclude] + [JsonPropertyName("audio_base64")] + public string AudioBase64 { get; private set; } + + [JsonIgnore] + public byte[] AudioBytes => Convert.FromBase64String(AudioBase64); + + [JsonInclude] + [JsonPropertyName("alignment")] + public Alignment Alignment { get; private set; } + } +} diff --git a/README.md b/README.md index de5f34c..d1741b3 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,7 @@ dotnet add package ElevenLabs-DotNet - [Text to Speech](#text-to-speech) - [Stream Text To Speech](#stream-text-to-speech) - [Voices](#voices) - - [Get Shared Voices](#get-shared-voices) :new: + - [Get Shared Voices](#get-shared-voices) - [Get All Voices](#get-all-voices) - [Get Default Voice Settings](#get-default-voice-settings) - [Get Voice](#get-voice) @@ -58,13 +58,13 @@ dotnet add package ElevenLabs-DotNet - [Samples](#samples) - [Download Voice Sample](#download-voice-sample) - [Delete Voice Sample](#delete-voice-sample) -- [Dubbing](#dubbing) :new: - - [Dub](#dub) :new: - - [Get Dubbing Metadata](#get-dubbing-metadata) :new: - - [Get Transcript for Dub](#get-transcript-for-dub) :new: - - [Get dubbed file](#get-dubbed-file) :new: - - [Delete Dubbing Project](#delete-dubbing-project) :new: -- [SFX Generation](#sfx-generation) :new: +- [Dubbing](#dubbing) + - [Dub](#dub) + - [Get Dubbing Metadata](#get-dubbing-metadata) + - [Get Transcript for Dub](#get-transcript-for-dub) + - [Get dubbed file](#get-dubbed-file) + - [Delete Dubbing Project](#delete-dubbing-project) +- [SFX Generation](#sfx-generation) - [History](#history) - [Get History](#get-history) - [Get History Item](#get-history-item) @@ -204,8 +204,8 @@ Convert text to speech. var api = new ElevenLabsClient(); var text = "The quick brown fox jumps over the lazy dog."; var voice = (await api.VoicesEndpoint.GetAllVoicesAsync()).FirstOrDefault(); -var defaultVoiceSettings = await api.VoicesEndpoint.GetDefaultVoiceSettingsAsync(); -var voiceClip = await api.TextToSpeechEndpoint.TextToSpeechAsync(text, voice, defaultVoiceSettings); +var request = new TextToSpeechRequest(voice, text); +var voiceClip = await api.TextToSpeechEndpoint.TextToSpeechAsync(request); await File.WriteAllBytesAsync($"{voiceClip.Id}.mp3", voiceClip.ClipData.ToArray()); ``` @@ -219,7 +219,8 @@ var text = "The quick brown fox jumps over the lazy dog."; var voice = (await api.VoicesEndpoint.GetAllVoicesAsync()).FirstOrDefault(); string fileName = "myfile.mp3"; using var outputFileStream = File.OpenWrite(fileName); -var voiceClip = await api.TextToSpeechEndpoint.TextToSpeechAsync(text, voice, +var request = new TextToSpeechRequest(voice, text); +var voiceClip = await api.TextToSpeechEndpoint.TextToSpeechAsync(request, partialClipCallback: async (partialClip) => { // Write the incoming data to the output file stream.