reab5555 committed
Commit 09abe1d · verified · 1 Parent(s): f1d8e24

Update transcription_diarization.py
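
Swap the local transformers Whisper pipeline (openai/whisper-large-v3, loaded lazily onto CUDA) for the hosted OpenAI Whisper API (whisper-1). The LazyTranscriptionPipeline class and the manual 30-second chunking loop are removed; transcript text and segment timestamps now come directly from the API's verbose_json response.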

Files changed (1):
  1. transcription_diarization.py +20 -63
transcription_diarization.py CHANGED
@@ -4,12 +4,15 @@ import gc
 import math
 from moviepy.editor import VideoFileClip
 from pyannote.audio import Pipeline
-from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
 import librosa
 import soundfile as sf
 import datetime
 from collections import defaultdict
 import numpy as np
+import openai
+from config import openai_api_key
+
+openai.api_key = openai_api_key
 
 class LazyDiarizationPipeline:
     def __init__(self):
@@ -25,81 +28,37 @@ class LazyDiarizationPipeline:
         gc.collect()
         return self.pipeline
 
-
-class LazyTranscriptionPipeline:
-    def __init__(self):
-        self.model = None
-        self.processor = None
-        self.pipe = None
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-    def get_pipeline(self):
-        if self.pipe is None:
-            model_id = "openai/whisper-large-v3"
-            torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
-            self.model = AutoModelForSpeechSeq2Seq.from_pretrained(
-                model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
-            )
-            self.model.to(self.device)
-            self.processor = AutoProcessor.from_pretrained(model_id)
-            self.pipe = pipeline(
-                "automatic-speech-recognition",
-                model=self.model,
-                tokenizer=self.processor.tokenizer,
-                feature_extractor=self.processor.feature_extractor,
-                chunk_length_s=30,
-                return_timestamps=True,
-                device=self.device
-            )
-        return self.pipe
-
-
 lazy_diarization_pipeline = LazyDiarizationPipeline()
-lazy_transcription_pipeline = LazyTranscriptionPipeline()
 
 def extract_audio(video_path, audio_path):
     video = VideoFileClip(video_path)
     audio = video.audio
     audio.write_audiofile(audio_path, codec='pcm_s16le', fps=16000)
 
-
 def format_timestamp(seconds):
     return str(datetime.timedelta(seconds=seconds)).split('.')[0]
 
-
 def transcribe_audio(audio_path, language):
-    pipe = lazy_transcription_pipeline.get_pipeline()
-
-    audio, sr = librosa.load(audio_path, sr=16000)
-    duration = len(audio) / sr
-    n_chunks = math.ceil(duration / 30)
-    transcription_txt = ""
+    with open(audio_path, "rb") as audio_file:
+        transcript = openai.Audio.transcribe(
+            file=audio_file,
+            model="whisper-1",
+            language=language,
+            response_format="verbose_json"
+        )
+
+    transcription_txt = transcript["text"]
     transcription_chunks = []
 
-    for i in range(n_chunks):
-        start = i * 30 * sr
-        end = min((i + 1) * 30 * sr, len(audio))
-        audio_chunk = audio[start:end]
-        audio_chunk = (audio_chunk * 32767).astype(np.float32)
-
-        result = pipe(audio_chunk, generate_kwargs={"language": language, "task": "transcribe"})
-
-        transcription_txt += result["text"]
-        for chunk in result["chunks"]:
-            start_time, end_time = chunk["timestamp"]
-            if start_time is None:
-                start_time = 0
-            if end_time is None:
-                end_time = 0
-            transcription_chunks.append({
-                "start": start_time + i * 30,
-                "end": end_time + i * 30,
-                "text": chunk["text"]
-            })
+    for segment in transcript["segments"]:
+        transcription_chunks.append({
+            "start": segment["start"],
+            "end": segment["end"],
+            "text": segment["text"]
+        })
 
     return transcription_txt, transcription_chunks
 
-
 def diarize_audio(audio_path, pipeline, max_speakers):
     # Load the entire audio file
     audio, sr = librosa.load(audio_path, sr=16000)
@@ -118,7 +77,6 @@ def diarize_audio(audio_path, pipeline, max_speakers):
 
     return diarization
 
-
 def create_combined_srt(transcription_chunks, diarization, output_path, max_speakers):
     speaker_segments = []
     speaker_durations = defaultdict(float)
@@ -155,7 +113,6 @@ def create_combined_srt(transcription_chunks, diarization, output_path, max_speakers):
         duration_str = format_timestamp(duration).split('.')[0].lstrip('0')
         srt_file.write(f"Speaker {i} (originally {speaker}): total duration {duration_str}\n")
 
-
 def process_video(video_path, hf_token, language, max_speakers=3):
    base_name = os.path.splitext(video_path)[0]
    audio_path = f"{base_name}.wav"
@@ -183,4 +140,4 @@ def process_video(video_path, hf_token, language, max_speakers=3):
     torch.cuda.empty_cache()
     gc.collect()
 
-    return combined_srt_path
+    return combined_srt_path
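
A compatibility note on the new transcribe_audio: openai.Audio.transcribe and the module-level openai.api_key assignment belong to the pre-1.0 openai Python package and were removed in its 1.0 release, so this change assumes a pinned openai<1.0. Under openai>=1.0 the equivalent call would look roughly like the sketch below (an illustration under that assumption, not part of this commit; the verbose_json response is then a typed object, so text and segments are attributes rather than dict keys):

# Minimal sketch assuming openai>=1.0; not part of this commit.
from openai import OpenAI

client = OpenAI(api_key=openai_api_key)  # or rely on the OPENAI_API_KEY env var

def transcribe_audio(audio_path, language):
    with open(audio_path, "rb") as audio_file:
        transcript = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file,
            language=language,
            response_format="verbose_json",
        )
    # Attribute access on the typed response object.
    transcription_txt = transcript.text
    transcription_chunks = [
        {"start": seg.start, "end": seg.end, "text": seg.text}
        for seg in transcript.segments
    ]
    return transcription_txt, transcription_chunks

Independent of the client version, the hosted endpoint caps uploads at 25 MB, so the 16 kHz PCM WAV produced by extract_audio may exceed the limit for long videos and need re-encoding or splitting before upload.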