forked from tcxnh/Voice-Integration-with-Furhat-Robot
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerate_cloned_voice.py
115 lines (88 loc) · 4.42 KB
/
generate_cloned_voice.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# ## Voice Style Control Demo
# Before running this Python file, please make sure all the libraries listed in requirements.txt are installed.
import os
import torch
import sys
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
# ### Initialization
def generate_cloned_voice(reference_speaker_path, generated_text):
    """Synthesize *generated_text* with the OpenVoice base speaker and
    re-color it with the tone embedding extracted from a reference voice.

    Parameters
    ----------
    reference_speaker_path : str
        Path to an audio file of the voice to clone (e.g. an .mp3/.wav).
    generated_text : str
        English text to synthesize in the cloned voice.

    Returns
    -------
    str
        Path of the generated wav file ('outputs/output_en_default.wav').
        (Bug fix: the original version returned None, giving callers no
        way to locate the output file.)
    """
    cwd = os.getcwd()
    # Make the bundled 'openvoice' package importable. Guard against
    # appending a duplicate entry when the function is called repeatedly
    # (the original appended unconditionally on every call).
    openvoice_dir = os.path.join(cwd, 'openvoice')
    if openvoice_dir not in sys.path:
        sys.path.append(openvoice_dir)
    # Imported lazily so this module can be loaded even when OpenVoice
    # is not installed (the path tweak above must also run first).
    from openvoice import se_extractor
    from openvoice.api import BaseSpeakerTTS, ToneColorConverter

    ckpt_base = 'checkpoints/base_speakers/EN'
    ckpt_converter = 'checkpoints/converter'
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    output_dir = 'outputs'

    # The base speaker TTS produces the raw speech; the tone color
    # converter then re-colors it to sound like the reference speaker.
    base_speaker_tts = BaseSpeakerTTS(f'{ckpt_base}/config.json', device=device)
    base_speaker_tts.load_ckpt(f'{ckpt_base}/checkpoint.pth')
    tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
    tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')
    os.makedirs(output_dir, exist_ok=True)

    # ### Obtain Tone Color Embedding
    # `source_se` is the tone color embedding of the base speaker — an
    # average over multiple sentences, shipped precomputed with the
    # checkpoints. `target_se` is extracted from the reference audio
    # (vad=True trims silence/non-speech before extraction).
    source_se = torch.load(f'{ckpt_base}/en_default_se.pth').to(device)
    target_se, audio_name = se_extractor.get_se(
        reference_speaker_path, tone_color_converter, target_dir='processed', vad=True)

    # ### Inference: run the base TTS, then convert its tone color.
    save_path = f'{output_dir}/output_en_default.wav'
    src_path = f'{output_dir}/tmp.wav'
    base_speaker_tts.tts(generated_text, src_path, speaker='default', language='English', speed=1.0)
    encode_message = "@MyShell"  # watermark message embedded in the output audio
    tone_color_converter.convert(
        audio_src_path=src_path,
        src_se=source_se,
        tgt_se=target_se,
        output_path=save_path,
        message=encode_message)

    # NOTE: other styles (speaker='whispering' etc. with en_style_se.pth,
    # adjustable `speed`) and other languages (the ZH checkpoints) follow
    # the exact same tts + convert pattern — see the OpenVoice demos.
    # (The original kept a large dead triple-quoted string here that was
    # evaluated and discarded on every call; it is now this comment.)

    print("Cloned Voice Generated!")
    return save_path
# **Tech for good.** For people who deploy OpenVoice for public use:
# we offer the option to add a watermark to deter potential misuse.
# Please see the ToneColorConverter class.
# **MyShell reserves the ability to detect whether an audio clip was generated by OpenVoice**,
# whether or not the watermark is added.
# if __name__ == "__main__":
#     generate_cloned_voice("resources/demo_speaker0.mp3", "I love u")