app.py
import os
import time
import wave
import torch
import base64
import whisper
import datetime
import contextlib
import numpy as np
import pandas as pd
from io import BytesIO
from pytube import YouTube
from pyannote.audio import Audio
from pyannote.core import Segment
from sklearn.cluster import AgglomerativeClustering
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
# Init is run on server startup
# Load your model to GPU as a global variable here using the variable name "model"
def init():
    global model
    global model_name
    global embedding_model

    # medium, large-v1, large-v2
    model_name = "large-v2"
    model = whisper.load_model(model_name)
    embedding_model = PretrainedSpeakerEmbedding(
        "speechbrain/spkrec-ecapa-voxceleb",
        device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))

def convert_time(secs):
    return datetime.timedelta(seconds=round(secs))

def get_youtube(video_url):
    yt = YouTube(video_url)
    abs_video_path = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download()
    print("-----Successfully downloaded video-----")
    print(abs_video_path)
    return abs_video_path

def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_speakers):
    model = whisper.load_model(whisper_model)
    time_start = time.time()
    if video_file_path is None:
        raise ValueError("Error: no video input")
    print(video_file_path)

    try:
        # Read and convert youtube video
        _, file_ending = os.path.splitext(f'{video_file_path}')
        print(f'file ending is {file_ending}')
        audio_file = video_file_path.replace(file_ending, ".wav")
        print("-----starting conversion to wav-----")
        os.system(f'ffmpeg -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{audio_file}"')

        # Get duration
        with contextlib.closing(wave.open(audio_file, 'r')) as f:
            frames = f.getnframes()
            rate = f.getframerate()
            duration = frames / float(rate)
        print(f"conversion to wav ready, duration of audio file: {duration}")

        # Transcribe audio
        options = dict(language=selected_source_lang, beam_size=5, best_of=5)
        transcribe_options = dict(task="transcribe", **options)
        result = model.transcribe(audio_file, **transcribe_options)
        segments = result["segments"]
        print("done with whisper transcription")
    except Exception as e:
        raise RuntimeError("Error converting video to audio") from e
    try:
        # Create embedding
        def segment_embedding(segment):
            audio = Audio()
            start = segment["start"]
            # Whisper overshoots the end timestamp in the last segment
            end = min(duration, segment["end"])
            clip = Segment(start, end)
            waveform, sample_rate = audio.crop(audio_file, clip)
            return embedding_model(waveform[None])

        embeddings = np.zeros(shape=(len(segments), 192))
        for i, segment in enumerate(segments):
            embeddings[i] = segment_embedding(segment)
        embeddings = np.nan_to_num(embeddings)
        print(f'Embedding shape: {embeddings.shape}')

        # Assign speaker label
        clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
        labels = clustering.labels_
        for i in range(len(segments)):
            segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)

        # Make output
        objects = {
            'Start': [],
            'End': [],
            'Speaker': [],
            'Text': []
        }
        text = ''
        for (i, segment) in enumerate(segments):
            if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
                objects['Start'].append(str(convert_time(segment["start"])))
                objects['Speaker'].append(segment["speaker"])
                if i != 0:
                    objects['End'].append(str(convert_time(segments[i - 1]["end"])))
                    objects['Text'].append(text)
                    text = ''
            text += segment["text"] + ' '
        # Close out the final speaker turn at the last segment's end
        objects['End'].append(str(convert_time(segments[i]["end"])))
        objects['Text'].append(text)

        time_end = time.time()
        time_diff = time_end - time_start
        system_info = f"""-----Processing time: {time_diff:.5} seconds-----"""
        print(system_info)
        return pd.DataFrame(objects)
    except Exception as e:
        raise RuntimeError("Error running inference with local model") from e

# Inference is run for every server call
# Reference your preloaded global model variable here.
def inference(model_inputs: dict) -> dict:
    global model
    global model_name
    global embedding_model

    # Parse out your arguments
    youtube_url = model_inputs.get('youtube_url', "https://www.youtube.com/watch?v=-UX0X45sYe4")
    selected_source_lang = model_inputs.get('language', "en")
    number_speakers = model_inputs.get('num_speakers', 2)
    if youtube_url is None:
        return {'message': "No input provided"}

    # Run the model
    video_in = get_youtube(youtube_url)
    transcription_df = speech_to_text(video_in, selected_source_lang, model_name, number_speakers)
    # print(transcription_df)
    # Return the results as JSON
    return transcription_df.to_json()
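

# --- Local smoke test: a minimal sketch, not part of the Banana serverless runtime ---
# Assumptions: ffmpeg is on PATH, the Whisper and speechbrain weights can be downloaded,
# and the example URL (the same default used in inference()) is reachable. Adjust as needed.
if __name__ == "__main__":
    init()
    result = inference({
        "youtube_url": "https://www.youtube.com/watch?v=-UX0X45sYe4",
        "language": "en",
        "num_speakers": 2,
    })
    print(result)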