reab5555 committed · Commit 156337d · verified · 1 Parent(s): d3be1e6

Update diarization.py

Files changed (1):
  1. diarization.py +47 -38
diarization.py CHANGED

@@ -1,6 +1,5 @@
 import os
 import torch
-import torchvision
 import math
 from moviepy.editor import VideoFileClip, AudioFileClip
 from pyannote.audio import Pipeline
@@ -11,6 +10,51 @@ from collections import defaultdict
 import numpy as np
 import spaces
 
+class LazyDiarizationPipeline:
+    def __init__(self):
+        self.pipeline = None
+
+    @spaces.GPU(duration=120)
+    def get_pipeline(self, diarization_access_token):
+        if self.pipeline is None:
+            self.pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1", use_auth_token=diarization_access_token)
+            self.pipeline = self.pipeline.to("cuda")
+        return self.pipeline
+
+lazy_diarization_pipeline = LazyDiarizationPipeline()
+
+class LazyTranscriptionPipeline:
+    def __init__(self):
+        self.model = None
+        self.processor = None
+        self.pipe = None
+
+    @spaces.GPU(duration=120)
+    def get_pipeline(self, language):
+        if self.pipe is None:
+            model_id = "openai/whisper-large-v3"
+            self.model = AutoModelForSpeechSeq2Seq.from_pretrained(
+                model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True, use_safetensors=True
+            )
+            self.model.to("cuda")
+            self.processor = AutoProcessor.from_pretrained(model_id)
+            self.pipe = pipeline(
+                "automatic-speech-recognition",
+                model=self.model,
+                tokenizer=self.processor.tokenizer,
+                feature_extractor=self.processor.feature_extractor,
+                max_new_tokens=128,
+                chunk_length_s=30,
+                batch_size=1,
+                return_timestamps=True,
+                torch_dtype=torch.float16,
+                device="cuda",
+                generate_kwargs={"language": language}
+            )
+        return self.pipe
+
+lazy_transcription_pipeline = LazyTranscriptionPipeline()
+
 def extract_audio(video_path, audio_path):
     video = VideoFileClip(video_path)
     audio = video.audio
@@ -21,30 +65,7 @@ def format_timestamp(seconds):
 
 @spaces.GPU(duration=300)
 def transcribe_audio(audio_path, language):
-    device = "cuda:0" if torch.cuda.is_available() else "cpu"
-    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
-    model_id = "openai/whisper-large-v3"
-
-    model = AutoModelForSpeechSeq2Seq.from_pretrained(
-        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
-    )
-    model.to(device)
-
-    processor = AutoProcessor.from_pretrained(model_id)
-
-    pipe = pipeline(
-        "automatic-speech-recognition",
-        model=model,
-        tokenizer=processor.tokenizer,
-        feature_extractor=processor.feature_extractor,
-        max_new_tokens=128,
-        chunk_length_s=30,
-        batch_size=1,
-        return_timestamps=True,
-        torch_dtype=torch_dtype,
-        device=device,
-        generate_kwargs={"language": language}
-    )
+    pipe = lazy_transcription_pipeline.get_pipeline(language)
 
     audio, sr = librosa.load(audio_path, sr=16000)
     duration = len(audio) / sr
@@ -118,15 +139,13 @@ def create_combined_srt(transcription_chunks, diarization, output_path):
 
 @spaces.GPU(duration=600)
 def process_video(video_path, diarization_access_token, language):
-    import torch
     base_name = os.path.splitext(video_path)[0]
     audio_path = f"{base_name}.wav"
     extract_audio(video_path, audio_path)
 
     # Diarization
     print("Performing diarization...")
-    pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1", use_auth_token=diarization_access_token)
-    pipeline = pipeline.to("cuda")
+    pipeline = lazy_diarization_pipeline.get_pipeline(diarization_access_token)
    diarization = pipeline(audio_path)
     print("Diarization complete.")
 
@@ -144,13 +163,3 @@ def process_video(video_path, diarization_access_token, language):
     os.remove(audio_path)
 
     return combined_srt_path
-
-if __name__ == "__main__":
-    video_path = r"C:\Users\reab5\Downloads\MediaHuman\Music\test1.mp4"
-    # Get Hugging Face token from Space secret
-    access_token = os.environ.get('hf_secret')
-    if not access_token:
-        raise ValueError("HF_TOKEN not found in environment variables. Please set it in the Space secrets.")
-
-    language = "en"
-    process_video(video_path, access_token, language)
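
Note on the change: both heavyweight pipelines are now cached in module-level lazy singletons, so repeated calls under the @spaces.GPU decorator reuse the already-loaded pyannote and Whisper models instead of reinitializing them on every invocation. A minimal sketch of the lazy-singleton pattern in isolation (the LazySingleton name and expensive_load placeholder are illustrative, not part of the commit):

class LazySingleton:
    def __init__(self, loader):
        self._loader = loader    # deferred constructor for the heavy object
        self._instance = None    # nothing is loaded until first use

    def get(self):
        # Load once on the first call, then return the cached instance.
        if self._instance is None:
            self._instance = self._loader()
        return self._instance

def expensive_load():
    # Stands in for Pipeline.from_pretrained(...) or
    # AutoModelForSpeechSeq2Seq.from_pretrained(...).
    print("loading model weights...")  # happens exactly once
    return object()

pipeline_singleton = LazySingleton(expensive_load)
pipeline_singleton.get()  # first call triggers the load
pipeline_singleton.get()  # second call reuses the cached instance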
 
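The deleted __main__ block was the module's only local entry point; on a Space, process_video is presumably invoked from app code instead. A hypothetical caller for illustration — the Gradio wiring below is an assumption, not part of this commit; only the 'hf_secret' secret name and the process_video signature come from the file itself:

# Hypothetical Space entry point (assumed, not from the commit).
import os
import gradio as gr
from diarization import process_video

def run(video_path):
    # 'hf_secret' matches the env var read by the removed __main__ block.
    token = os.environ.get("hf_secret")
    if not token:
        raise ValueError("hf_secret not found in the Space secrets.")
    return process_video(video_path, token, "en")  # returns the combined SRT path

demo = gr.Interface(fn=run, inputs=gr.Video(), outputs=gr.File())
demo.launch()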