forked from tcxnh/Voice-Integration-with-Furhat-Robot
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerate_cloned_voice.py
115 lines (88 loc) · 4.42 KB
/
generate_cloned_voice.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# ## Voice Style Control Demo
# Before running this Python file, please make sure all the libraries listed in requirements.txt are installed.
import os
import torch
import sys
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
# ### Initialization
def generate_cloned_voice(reference_speaker_path, generated_text):
    """Synthesize *generated_text* with the OpenVoice base speaker and
    re-color it with the tone embedding extracted from a reference voice.

    Parameters
    ----------
    reference_speaker_path : str
        Path to an audio file of the voice to clone (e.g. an .mp3/.wav).
    generated_text : str
        English text to synthesize in the cloned voice.

    Returns
    -------
    str
        Path of the generated wav file ('outputs/output_en_default.wav').
        (Bug fix: the original version returned None, giving callers no
        way to locate the output file.)
    """
    cwd = os.getcwd()
    # Make the bundled 'openvoice' package importable. Guard against
    # appending a duplicate entry when the function is called repeatedly
    # (the original appended unconditionally on every call).
    openvoice_dir = os.path.join(cwd, 'openvoice')
    if openvoice_dir not in sys.path:
        sys.path.append(openvoice_dir)
    # Imported lazily so this module can be loaded even when OpenVoice
    # is not installed (the path tweak above must also run first).
    from openvoice import se_extractor
    from openvoice.api import BaseSpeakerTTS, ToneColorConverter

    ckpt_base = 'checkpoints/base_speakers/EN'
    ckpt_converter = 'checkpoints/converter'
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    output_dir = 'outputs'

    # The base speaker TTS produces the raw speech; the tone color
    # converter then re-colors it to sound like the reference speaker.
    base_speaker_tts = BaseSpeakerTTS(f'{ckpt_base}/config.json', device=device)
    base_speaker_tts.load_ckpt(f'{ckpt_base}/checkpoint.pth')
    tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
    tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')
    os.makedirs(output_dir, exist_ok=True)

    # ### Obtain Tone Color Embedding
    # `source_se` is the tone color embedding of the base speaker — an
    # average over multiple sentences, shipped precomputed with the
    # checkpoints. `target_se` is extracted from the reference audio
    # (vad=True trims silence/non-speech before extraction).
    source_se = torch.load(f'{ckpt_base}/en_default_se.pth').to(device)
    target_se, audio_name = se_extractor.get_se(
        reference_speaker_path, tone_color_converter, target_dir='processed', vad=True)

    # ### Inference: run the base TTS, then convert its tone color.
    save_path = f'{output_dir}/output_en_default.wav'
    src_path = f'{output_dir}/tmp.wav'
    base_speaker_tts.tts(generated_text, src_path, speaker='default', language='English', speed=1.0)
    encode_message = "@MyShell"  # watermark message embedded in the output audio
    tone_color_converter.convert(
        audio_src_path=src_path,
        src_se=source_se,
        tgt_se=target_se,
        output_path=save_path,
        message=encode_message)

    # NOTE: other styles (speaker='whispering' etc. with en_style_se.pth,
    # adjustable `speed`) and other languages (the ZH checkpoints) follow
    # the exact same tts + convert pattern — see the OpenVoice demos.
    # (The original kept a large dead triple-quoted string here that was
    # evaluated and discarded on every call; it is now this comment.)

    print("Cloned Voice Generated!")
    return save_path
# **Tech for good.** For people who deploy OpenVoice for public use:
# we offer the option to add a watermark to deter potential misuse.
# Please see the ToneColorConverter class.
# **MyShell reserves the ability to detect whether an audio clip was generated by OpenVoice**,
# whether or not the watermark is added.
# if __name__ == "__main__":
#     generate_cloned_voice("resources/demo_speaker0.mp3", "I love u")