reab5555 committed · verified
Commit 55553bb · 1 Parent(s): db2d41c

Update app.py

Files changed (1)
  1. app.py +1 -168
app.py CHANGED
@@ -157,177 +157,10 @@ def process_video(video_file):
 iface = gr.Interface(
     fn=process_video,
     inputs=gr.File(label="Upload Video File"),
-    outputs=gr.Textbox(label="Analysis Result"),
+    outputs=gr.Textbox(label="Results"),
     title="Video Analysis with Meta-Llama-3.1-8B-Instruct",
     description="Upload a video file to analyze using RAG techniques with Meta-Llama-3.1-8B-Instruct."
 )
 
 # Launch the app
 iface.launch()
-
-# Diarization script
-import os
-import torch
-import math
-from moviepy.editor import VideoFileClip, AudioFileClip
-from pyannote.audio import Pipeline
-from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
-import librosa
-import datetime
-from collections import defaultdict
-import numpy as np
-import spaces
-
-class LazyDiarizationPipeline:
-    def __init__(self):
-        self.pipeline = None
-
-    @spaces.GPU(duration=120)
-    def get_pipeline(self, diarization_access_token):
-        if self.pipeline is None:
-            self.pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1", use_auth_token=diarization_access_token)
-            self.pipeline = self.pipeline.to(torch.device("cuda"))
-        return self.pipeline
-
-lazy_diarization_pipeline = LazyDiarizationPipeline()
-
-class LazyTranscriptionPipeline:
-    def __init__(self):
-        self.model = None
-        self.processor = None
-        self.pipe = None
-
-    @spaces.GPU(duration=120)
-    def get_pipeline(self, language):
-        if self.pipe is None:
-            model_id = "openai/whisper-large-v3"
-            self.model = AutoModelForSpeechSeq2Seq.from_pretrained(
-                model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True, use_safetensors=True
-            )
-            self.model.to(torch.device("cuda"))
-            self.processor = AutoProcessor.from_pretrained(model_id)
-            self.pipe = pipeline(
-                "automatic-speech-recognition",
-                model=self.model,
-                tokenizer=self.processor.tokenizer,
-                feature_extractor=self.processor.feature_extractor,
-                max_new_tokens=128,
-                chunk_length_s=30,
-                batch_size=1,
-                return_timestamps=True,
-                torch_dtype=torch.float16,
-                device=torch.device("cuda"),
-                generate_kwargs={"language": language}
-            )
-        return self.pipe
-
-lazy_transcription_pipeline = LazyTranscriptionPipeline()
-
-def extract_audio(video_path, audio_path):
-    video = VideoFileClip(video_path)
-    audio = video.audio
-    audio.write_audiofile(audio_path, codec='pcm_s16le', fps=16000)
-
-def format_timestamp(seconds):
-    return str(datetime.timedelta(seconds=seconds)).split('.')[0]
-
-@spaces.GPU(duration=100)
-def transcribe_audio(audio_path, language):
-    pipe = lazy_transcription_pipeline.get_pipeline(language)
-
-    audio, sr = librosa.load(audio_path, sr=16000)
-    duration = len(audio) / sr
-    n_chunks = math.ceil(duration / 30)
-    transcription_txt = ""
-    transcription_chunks = []
-
-    for i in range(n_chunks):
-        start = i * 30 * sr
-        end = min((i + 1) * 30 * sr, len(audio))
-        audio_chunk = audio[start:end]
-
-        # Convert the audio chunk to float32 numpy array
-        audio_chunk = (audio_chunk * 32767).astype(np.float32)
-
-        result = pipe(audio_chunk)
-        transcription_txt += result["text"]
-        for chunk in result["chunks"]:
-            start_time, end_time = chunk["timestamp"]
-            transcription_chunks.append({
-                "start": start_time + i * 30,
-                "end": end_time + i * 30,
-                "text": chunk["text"]
-            })
-
-        print(f"Transcription Progress: {int(((i + 1) / n_chunks) * 100)}%")
-
-    return transcription_txt, transcription_chunks
-
-def create_combined_srt(transcription_chunks, diarization, output_path):
-    speaker_segments = []
-    speaker_map = {}
-    current_speaker_num = 1
-
-    for segment, _, speaker in diarization.itertracks(yield_label=True):
-        if speaker not in speaker_map:
-            speaker_map[speaker] = f"Speaker {current_speaker_num}"
-            current_speaker_num += 1
-        speaker_segments.append((segment.start, segment.end, speaker_map[speaker]))
-
-    with open(output_path, 'w', encoding='utf-8') as srt_file:
-        for i, chunk in enumerate(transcription_chunks, 1):
-            start_time, end_time = chunk["start"], chunk["end"]
-            text = chunk["text"]
-
-            # Find the corresponding speaker
-            current_speaker = "Unknown"
-            for seg_start, seg_end, speaker in speaker_segments:
-                if seg_start <= start_time < seg_end:
-                    current_speaker = speaker
-                    break
-
-            # Format timecodes as h:mm:ss (without leading zeros for hours)
-            start_str = format_timestamp(start_time).split('.')[0].lstrip('0')
-            end_str = format_timestamp(end_time).split('.')[0].lstrip('0')
-
-            srt_file.write(f"{i}\n")
-            srt_file.write(f"{{{current_speaker}}}\n time: ({start_str} --> {end_str})\n text: {text}\n\n")
-
-    # Add dominant speaker information
-    speaker_durations = defaultdict(float)
-    for seg_start, seg_end, speaker in speaker_segments:
-        speaker_durations[speaker] += seg_end - seg_start
-
-    dominant_speaker = max(speaker_durations, key=speaker_durations.get)
-    dominant_duration = speaker_durations[dominant_speaker]
-
-    with open(output_path, 'a', encoding='utf-8') as srt_file:
-        dominant_duration_str = format_timestamp(dominant_duration).split('.')[0].lstrip('0')
-        srt_file.write(f"\nMost dominant speaker: {dominant_speaker} with total duration {dominant_duration_str}\n")
-
-@spaces.GPU(duration=100)
-def process_video(video_path, diarization_access_token, language):
-    base_name = os.path.splitext(video_path)[0]
-    audio_path = f"{base_name}.wav"
-    extract_audio(video_path, audio_path)
-
-    # Diarization
-    print("Performing diarization...")
-    pipeline = lazy_diarization_pipeline.get_pipeline(diarization_access_token)
-    diarization = pipeline(audio_path)
-    print("Diarization complete.")
-
-    # Transcription
-    print("Performing transcription...")
-    transcription, chunks = transcribe_audio(audio_path, language)
-    print("Transcription complete.")
-
-    # Create combined SRT file
-    combined_srt_path = f"{base_name}_combined.srt"
-    create_combined_srt(chunks, diarization, combined_srt_path)
-    print(f"Combined SRT file created and saved to {combined_srt_path}")
-
-    # Clean up
-    os.remove(audio_path)
-
-    return combined_srt_path