Speech model fully released
JuanFMontesinos committed Jul 4, 2022
1 parent b3dab39 commit 3dd0fdb
Showing 4 changed files with 29 additions and 11 deletions.
14 changes: 11 additions & 3 deletions README.md
@@ -30,21 +30,29 @@ cd ..
### Requirements
The core computation (the model itself) depends on Python, PyTorch, einops and torchaudio. Running the demos and visualizations requires several additional libraries.

*Note: Currently, only the offline computation is supported in a user-friendly way.*

In case of incompatibilities caused by future updates, the tested 3DDFA_V2 commit is:
`https://github.com/cleardusk/3DDFA_V2/tree/1b6c67601abffc1e9f248b291708aef0e43b55ae`
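
A quick way to check that the core dependencies are importable (a throwaway sketch, not a script shipped with this repo):

```
# Sanity check for the core dependencies; the demos assume a CUDA device ('cuda:0').
import torch
import torchaudio
import einops

print(torch.__version__, torchaudio.__version__, einops.__version__)
print(torch.cuda.is_available())
```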

## Running a demo

Demos are located in the `demo_samples` folder.
To process the `interview.mp4` example, modify `inference_interview.py`:
```
device = 'cuda:0'
path = 'demo_samples/interview'
compute_landmarks = True
```
`compute_landmarks = True` computes the landmarks on-the-fly (useful for production-ready pipelines); this requires curated data (videos already cropped around the face, etc.).
`compute_landmarks = False` uses the precomputed landmarks produced by `preprocessing_interview.py`.

Then run:

```
python preprocessing_interview.py
python inference_interview.py
```
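
The inference script writes the separated tracks next to the demo data; a minimal way to inspect one of them (assuming, as in the demo scripts, that `read` comes from `scipy.io.wavfile`):

```
# Load one of the estimated tracks written by inference_interview.py.
from scipy.io.wavfile import read

sr, wav = read('demo_samples/interview/speaker1_estimated.wav')
print(sr, wav.shape)  # sr should match vovit.core.AUDIO_SAMPLERATE
```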


## Latency

| | Preprocessing | Inference | | Preprocessing + Inference |
Binary file added demo_samples/comedy.mp4
Binary file not shown.
19 changes: 16 additions & 3 deletions inference_interview.py
@@ -7,12 +7,20 @@

device = 'cuda:0'
path = 'demo_samples/interview'
compute_landmarks = False
# If True, landmarks are computed on-the-fly on the GPU; otherwise precomputed landmarks are used

# Loading the data
speaker1_face = torch.from_numpy(np.load(f'{path}/speaker1_ld.npy')).to(device)
speaker2_face = torch.from_numpy(np.load(f'{path}/speaker2_ld.npy')).to(device)
if compute_landmarks:
    speaker2_face = torch.from_numpy(np.load(f'{path}/speaker2.npy')).to(device)
    speaker1_face = torch.from_numpy(np.load(f'{path}/speaker1.npy')).to(device)
else:
    speaker1_face = torch.from_numpy(np.load(f'{path}/speaker1_ld.npy')).to(device)
    speaker2_face = torch.from_numpy(np.load(f'{path}/speaker2_ld.npy')).to(device)

mixture = torch.from_numpy(read(f'{path}/audio.wav')[1]).to(device)

model = vovit.End2EndVoViT(model_name='VoViT_speech', debug={}).to(device)
model = vovit.End2EndVoViT(model_name='VoViT_speech', extract_landmarks=compute_landmarks, debug={}).to(device)
model.eval()
with torch.no_grad():
pred_s1 = model.forward_unlimited(mixture, speaker1_face)
@@ -27,3 +35,8 @@
plt.savefig(f'{path}/s2_sp.png')
write(f'{path}/speaker1_estimated.wav', vovit.core.AUDIO_SAMPLERATE, wav_s1)
write(f'{path}/speaker2_estimated.wav', vovit.core.AUDIO_SAMPLERATE, wav_s2)

vovit.utils.ffmpeg_join(f'{path}/speaker1_landmarks.mp4', f'{path}/speaker1_estimated.wav',
                        f'{path}/speaker1_estimated.mp4')
vovit.utils.ffmpeg_join(f'{path}/speaker2_landmarks.mp4', f'{path}/speaker2_estimated.wav',
                        f'{path}/speaker2_estimated.mp4')
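
For footage without precomputed landmarks, the on-the-fly path (`compute_landmarks = True`) reduces to roughly the sketch below; the import lines and file names are assumptions, the calls mirror the script above:

```
import numpy as np
import torch
import vovit
from scipy.io.wavfile import read

device = 'cuda:0'
path = 'demo_samples/interview'

# Raw cropped face frames (not landmarks) and the audio mixture.
frames = torch.from_numpy(np.load(f'{path}/speaker1.npy')).to(device)
mixture = torch.from_numpy(read(f'{path}/audio.wav')[1]).to(device)

# extract_landmarks=True makes the model compute the landmarks itself on the GPU.
model = vovit.End2EndVoViT(model_name='VoViT_speech', extract_landmarks=True, debug={}).to(device)
model.eval()
with torch.no_grad():
    pred = model.forward_unlimited(mixture, frames)
```
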
7 changes: 2 additions & 5 deletions preprocessing_interview.py
@@ -15,9 +15,6 @@
video = np.stack(io.mimread(path + '.mp4', memtest=False))
# Shape is (438, 720, 1280, 3)

plt.imshow(video[200])
plt.show()

# Crop the face for each speaker
speaker1 = video[:8 * video_fps, 100:450, 250:550]
speaker2 = video[:8 * video_fps, 50:400, 750:1100]
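# (frames, rows, cols) slicing: keep the first 8 s of frames and a fixed box around each speaker's face.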
@@ -39,8 +36,8 @@
io.mimwrite(f'{path}/speaker1.mp4', speaker1, fps=video_fps)
io.mimwrite(f'{path}/speaker2.mp4', speaker2, fps=video_fps)

audio, sr = librosa.load('demo_samples/interview.mp4', sr=vovit.core.AUDIO_SAMPLERATE, duration=9)
write('demo_samples/interview/audio.wav', sr, audio[:vovit.core.AUDIO_SAMPLERATE * 8])
audio, sr = librosa.load(f'{path}.mp4', sr=vovit.core.AUDIO_SAMPLERATE, duration=9)
write(f'{path}/audio.wav', sr, audio[:vovit.core.AUDIO_SAMPLERATE * 8])
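# Keep exactly 8 s of audio (AUDIO_SAMPLERATE * 8 samples) to match the 8 s of video kept above.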

vovit.utils.process_video(f'{path}/speaker1.mp4',
                          video_dst=f'{path}/speaker1_landmarks.mp4',
