-
Notifications
You must be signed in to change notification settings - Fork 15
/
Copy pathdemo_mic.py
230 lines (193 loc) · 8.68 KB
/
demo_mic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
import cv2
from dataclasses import dataclass, asdict
import glob
import numpy as np
import os
import pyaudio
import scipy.io as sio
from scipy.io import wavfile
import shutil
import torch
import torch.nn.functional as F
import torchvision.utils as vutils
import webrtcvad
from mfcc import MFCC
from config import NETWORKS_PARAMETERS
from network import get_network, SynergyNet
from utils import voice2face, read_obj
from vad import read_wave, write_wave, frame_generator, vad_collector
from pyaudio_recording import Recorder
from utilf.render import render_vert
@dataclass
class StreamParams:
    """Bundle of audio-stream settings used when recording from the microphone.

    The field names mirror the keyword arguments accepted by PyAudio's
    stream-opening call; presumably ``Recorder`` forwards ``to_dict()`` to
    it -- confirm against ``pyaudio_recording.Recorder``.
    """

    # Sample format constant taken from PyAudio.
    format: int = pyaudio.paInt16
    # Number of audio channels.
    channels: int = 1
    # Sampling rate (matches the ``samprate=16000`` passed to MFCC in main).
    rate: int = 16000
    # Buffer size handed to the stream.
    frames_per_buffer: int = 1024
    # Open the stream for input and not for output.
    input: bool = True
    output: bool = False

    def to_dict(self) -> dict:
        """Return every field of this instance as a plain ``dict``."""
        settings = asdict(self)
        return settings
def rm_sil(voice_file, vad_obj):
    """
    Remove silence from a recording and return the remaining samples.

    The audio is split into frames, the voice-activity detector selects the
    voiced segments, and each segment is round-tripped through a WAV file so
    that ``scipy.io.wavfile`` can decode it into samples.

    Args:
        voice_file: Path of the WAV file to process.
        vad_obj: Voice-activity detector handed to ``vad_collector``.

    Returns:
        A 1-D ``int16`` NumPy array with all detected segments concatenated.

    Raises:
        ValueError: If the detector yields no segment at all (previously this
            surfaced as an ``UnboundLocalError``).
    """
    # Imported locally so this function stays self-contained.
    import tempfile

    audio, sample_rate = read_wave(voice_file)
    # 20 / 50 are presumably the frame and padding durations in ms, as in the
    # py-webrtcvad example -- confirm against vad.py.
    frames = list(frame_generator(20, audio, sample_rate))
    segments = vad_collector(sample_rate, 20, 50, vad_obj, frames)

    wave_data = []
    # A private temporary directory avoids deleting an unrelated ``tmp/`` in
    # the working directory and is cleaned up even if a write/read fails.
    with tempfile.TemporaryDirectory() as tmp_dir:
        for i, segment in enumerate(segments):
            segment_file = os.path.join(tmp_dir, f'{i}.wav')
            write_wave(segment_file, segment, sample_rate)
            wave_data.append(wavfile.read(segment_file)[1])

    if not wave_data:
        raise ValueError(
            f"no voiced segment detected in {voice_file!r}; "
            "cannot remove silence from an all-silent recording"
        )
    return np.concatenate(wave_data).astype('int16')
def get_fbank(voice, mfc_obj):
    """
    Process audio and create a normalized, fixed-length log mel-spectrogram.

    Args:
        voice: Audio samples passed unchanged to ``mfc_obj.sig2logspec``.
        mfc_obj: Feature extractor exposing ``sig2logspec(voice)`` that
            returns an array with frames along axis 0.

    Returns:
        A ``float32`` array with exactly 1000 frames along axis 0, each
        feature column normalized to zero mean and unit variance.

    Raises:
        ValueError: If the extractor returns zero frames. Without this guard
            the repeat-to-length step could never reach 1000 frames.
    """
    # Extract log mel-spectrogram
    fbank = mfc_obj.sig2logspec(voice).astype('float32')

    init_frame_number = fbank.shape[0]
    if init_frame_number == 0:
        raise ValueError("cannot build a spectrogram from audio with zero frames")

    # Mean and variance normalization of each mel-frequency
    fbank = fbank - fbank.mean(axis=0)
    # eps keeps constant columns from dividing by zero.
    fbank = fbank / (fbank.std(axis=0) + np.finfo(np.float32).eps)

    # If the duration of a voice recording is less than 10 seconds (1000 frames),
    # repeat the recording until it is at least that long, then crop.
    full_frame_number = 1000
    if init_frame_number < full_frame_number:
        # Ceiling division: smallest number of copies that covers 1000 frames.
        repeats = -(-full_frame_number // init_frame_number)
        fbank = np.concatenate([fbank] * repeats, axis=0)
    return fbank[0:full_frame_number, :]
def voice2face(e_net, g_net, voice_file, vad_obj, mfc_obj, GPU=True):
    """
    Synthesize a face from a voice recording.

    Silence is stripped with ``rm_sil``, the result is turned into a
    spectrogram with ``get_fbank``, the spectrogram is embedded by ``e_net``,
    the embedding is normalized, and ``g_net`` generates the output from it.

    Args:
        e_net: Network mapping the spectrogram batch to an embedding.
        g_net: Network mapping the normalized embedding to the output.
        voice_file: Path of the WAV file to read.
        vad_obj: Voice-activity detector forwarded to ``rm_sil``.
        mfc_obj: Feature extractor forwarded to ``get_fbank``.
        GPU: When true, the input tensor is moved to CUDA before inference.

    Returns:
        Whatever ``g_net`` returns for the normalized embedding.
    """
    voiced_samples = rm_sil(voice_file, vad_obj)
    spectrogram = get_fbank(voiced_samples, mfc_obj)

    # Swap the two spectrogram axes and prepend a batch axis of size 1.
    batch = spectrogram.T[np.newaxis].astype('float32')
    net_input = torch.from_numpy(batch)
    if GPU:
        net_input = net_input.cuda()

    unit_embedding = F.normalize(e_net(net_input))
    return g_net(unit_embedding)
# Vertex-index pairs whose Euclidean distance is reported, in print order.
# The labels are the ones used in the printed statistics.
_FACE_SPANS = (
    ("forehead", 1678, 42117),
    ("cheek-to-cheek", 2294, 13635),
    ("ear-to-ear", 20636, 34153),
    ("midline", 2130, 15003),
)

# Reference meshes the prediction is compared against, with the printed label.
_SHAPE_TEMPLATES = (
    ("WIDE", 'wide.obj'),
    ("SKINNY", 'skinny.obj'),
    ("REGULAR", 'regular.obj'),
    ("SLIM", 'slim.obj'),
)


def _mesh_error(pred_vertices, reference):
    """Return the norm of the vertex difference divided by the vertex count."""
    return np.linalg.norm(pred_vertices - reference) / pred_vertices.shape[0]


def _span_length(vertices, idx_a, idx_b):
    """Return the Euclidean distance between two vertices of a mesh."""
    return np.linalg.norm(vertices[idx_a] - vertices[idx_b])


def _report_gender(pred_vertices):
    """Print which mean gender mesh is closer and the relative span differences."""
    mean_male = read_obj('male.obj')  # 53215 * 3
    mean_female = read_obj('female.obj')  # 53215 * 3

    if _mesh_error(pred_vertices, mean_male) < _mesh_error(pred_vertices, mean_female):
        gender, reference = "male", mean_male
    else:
        gender, reference = "female", mean_female

    print(f"This is a {gender}'s voice")
    print("Statistics from the predicted mesh and mean gender mesh")
    for label, idx_a, idx_b in _FACE_SPANS:
        pred_d = _span_length(pred_vertices, idx_a, idx_b)
        target_d = _span_length(reference, idx_a, idx_b)
        # Signed relative difference: positive means the predicted span is longer.
        ratio = (pred_d - target_d) / target_d
        print(f"The {label} is {ratio*100}% than the mean {gender} shape")


def _report_face_shape(pred_vertices):
    """Print the label of the template mesh with the smallest error."""
    errors = [
        _mesh_error(pred_vertices, read_obj(obj_path))
        for _, obj_path in _SHAPE_TEMPLATES
    ]
    closest_label = _SHAPE_TEMPLATES[int(np.argmin(errors))][0]
    print(f"The face shape is closer to {closest_label}")


def main():
    """
    Run the voice-to-face demo on ``audio.wav`` and save the results.

    Steps: synthesize a face image from the recording with ``voice2face``,
    upsample it to 120x120, estimate pose with the pretrained SynergyNet,
    reconstruct the mesh with the checkpointed SynergyNet, print gender and
    face-shape statistics, and write ``data/results/micIn_image.png``,
    ``data/results/micIn_overlap.png`` and ``audio.png``.

    Both SynergyNet models are moved to CUDA unconditionally, so a CUDA
    device is required regardless of ``NETWORKS_PARAMETERS['GPU']``.
    """
    # recording and save under the root
    filename = "audio.wav"
    # Microphone capture is disabled here; an existing audio.wav is used.
    # stream_params = StreamParams()
    # recorder = Recorder(stream_params)
    # # record for 5 seconds
    # recorder.record(5, filename)

    # initialization
    # voice activity detector, aggressiveness = 2
    vad_obj = webrtcvad.Vad(2)
    # Mel-Frequency extractor
    mfc_obj = MFCC(nfilt=64, lowerf=20., upperf=7200., samprate=16000, nfft=1024, wlen=0.025)

    # net definition
    e_net, _ = get_network('e', NETWORKS_PARAMETERS, train=False)
    g_net, _ = get_network('g', NETWORKS_PARAMETERS, train=False)

    # building models: unsupervised
    image3D = SynergyNet(pretrained=False, last_CN=None).cuda().eval()
    backbone_ckpt = torch.load(NETWORKS_PARAMETERS['image3D']['model_path'])
    image3D.load_state_dict(backbone_ckpt)

    # SynergyNet pretrained network for getting pose
    image3D_pretrained = SynergyNet(pretrained=True).cuda().eval()

    # data and config
    up_layer = torch.nn.Upsample((120, 120), mode='bilinear', align_corners=True)

    # default savepath
    FOLDER_ROOT = 'data/results/'
    os.makedirs(FOLDER_ROOT, exist_ok=True)

    with torch.no_grad():
        # voice2face
        face_image = voice2face(e_net, g_net, filename, vad_obj, mfc_obj, NETWORKS_PARAMETERS['GPU'])
        face_image = up_layer(face_image)

        # Pose from 3DDFA-V2
        pose = image3D_pretrained(face_image, return_onlypose=True)
        R, off = image3D_pretrained.parse_param_102_pose(pose)

        # Alignment with synthesized image
        prediction_fr = image3D(face_image)
        prediction = R @ prediction_fr + off

        # Compare against the mean male / female meshes and the shape
        # templates using the un-posed prediction, laid out one vertex per row.
        prediction_fr_np = np.transpose(prediction_fr.squeeze(0).cpu().numpy(), (1, 0))

        print("-------------------------")
        _report_gender(prediction_fr_np)
        print("-------------------------")
        _report_face_shape(prediction_fr_np)
        print("-------------------------")

        # transform to image coordinate space
        # NOTE(review): the image is 120x120 but the flip uses 127 -- confirm intended.
        prediction[:, 1, :] = 127 - prediction[:, 1, :]
        save_name = os.path.join(FOLDER_ROOT, 'micIn')
        # Map [-1, 1] to 8-bit pixel values, move channels last, and reverse
        # the channel order before handing the image to OpenCV.
        img = (((face_image[0].clamp(-1, 1)) * 127.5) + 128).detach().cpu().numpy().astype(np.uint8)
        img = np.transpose(img, (1, 2, 0))
        img = img[:, :, [2, 1, 0]]
        pred = prediction[0].detach().cpu().numpy()

        # save
        cv2.imwrite(save_name + '_image.png', img)
        render_vert(img, pred, alpha=1.0, wfp=save_name + '_overlap.png')
        vutils.save_image(face_image.detach().clamp(-1, 1), filename.replace('.wav', '.png'), normalize=True)
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()