reab5555 committed on
Commit
fb650ff
·
verified ·
1 Parent(s): c3e3bb4

Update transcription_diarization.py

Browse files
Files changed (1) hide show
  1. transcription_diarization.py +32 -10
transcription_diarization.py CHANGED
@@ -9,7 +9,7 @@ import datetime
9
  from collections import defaultdict
10
  import numpy as np
11
  from openai import OpenAI
12
- from config import openai_api_key
13
  import json
14
  from multiprocessing import Pool, cpu_count
15
  from functools import partial
@@ -66,6 +66,31 @@ def diarize_audio(audio_path, pipeline, max_speakers):
66
 
67
  return diarization
68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  def create_combined_srt(transcription_chunks, diarization, output_path, max_speakers):
70
  speaker_segments = []
71
  speaker_durations = defaultdict(float)
@@ -99,8 +124,8 @@ def create_combined_srt(transcription_chunks, diarization, output_path, max_spea
99
  if new_speaker != current_speaker or (end_time - current_start > 10): # 10 seconds max duration
100
  if current_text:
101
  entry_count += 1
102
- start_str = format_timestamp(current_start).zfill(8)
103
- end_str = format_timestamp(current_end).zfill(8)
104
  srt_file.write(f"[{entry_count}. {current_speaker} | time: ({start_str} --> {end_str}) | text: {current_text.strip()}]\n\n")
105
 
106
  current_speaker = new_speaker
@@ -114,19 +139,16 @@ def create_combined_srt(transcription_chunks, diarization, output_path, max_spea
114
  # Write the last entry
115
  if current_text:
116
  entry_count += 1
117
- start_str = format_timestamp(current_start).zfill(8)
118
- end_str = format_timestamp(current_end).zfill(8)
119
  srt_file.write(f"[{entry_count}. {current_speaker} | time: ({start_str} --> {end_str}) | text: {current_text.strip()}]\n\n")
120
 
121
  with open(output_path, 'a', encoding='utf-8') as srt_file:
122
  for i, (speaker, duration) in enumerate(sorted_speakers, start=1):
123
- duration_str = format_timestamp(duration).zfill(8)
124
  srt_file.write(f"Speaker {i} (originally {speaker}): total duration {duration_str}\n")
125
 
126
- def format_timestamp(seconds):
127
- return str(datetime.timedelta(seconds=round(seconds))).zfill(8)
128
-
129
- def process_video(video_path, hf_token, language, max_speakers=3):
130
  base_name = os.path.splitext(video_path)[0]
131
  audio_path = f"{base_name}.wav"
132
  extract_audio(video_path, audio_path)
 
9
  from collections import defaultdict
10
  import numpy as np
11
  from openai import OpenAI
12
+ from config import openai_api_key, hf_token
13
  import json
14
  from multiprocessing import Pool, cpu_count
15
  from functools import partial
 
66
 
67
  return diarization
68
 
69
+ def transcribe_audio(audio_path, language):
70
+ with open(audio_path, "rb") as audio_file:
71
+ transcript = client.audio.transcriptions.create(
72
+ file=audio_file,
73
+ model="whisper-1",
74
+ language=language,
75
+ response_format="verbose_json"
76
+ )
77
+
78
+ # Convert the response to a dictionary if it's not already
79
+ if not isinstance(transcript, dict):
80
+ transcript = transcript.model_dump()
81
+
82
+ transcription_txt = transcript.get("text", "")
83
+ transcription_chunks = []
84
+
85
+ for segment in transcript.get("segments", []):
86
+ transcription_chunks.append({
87
+ "start": segment.get("start", 0),
88
+ "end": segment.get("end", 0),
89
+ "text": segment.get("text", "")
90
+ })
91
+
92
+ return transcription_txt, transcription_chunks
93
+
94
  def create_combined_srt(transcription_chunks, diarization, output_path, max_speakers):
95
  speaker_segments = []
96
  speaker_durations = defaultdict(float)
 
124
  if new_speaker != current_speaker or (end_time - current_start > 10): # 10 seconds max duration
125
  if current_text:
126
  entry_count += 1
127
+ start_str = format_timestamp(current_start)
128
+ end_str = format_timestamp(current_end)
129
  srt_file.write(f"[{entry_count}. {current_speaker} | time: ({start_str} --> {end_str}) | text: {current_text.strip()}]\n\n")
130
 
131
  current_speaker = new_speaker
 
139
  # Write the last entry
140
  if current_text:
141
  entry_count += 1
142
+ start_str = format_timestamp(current_start)
143
+ end_str = format_timestamp(current_end)
144
  srt_file.write(f"[{entry_count}. {current_speaker} | time: ({start_str} --> {end_str}) | text: {current_text.strip()}]\n\n")
145
 
146
  with open(output_path, 'a', encoding='utf-8') as srt_file:
147
  for i, (speaker, duration) in enumerate(sorted_speakers, start=1):
148
+ duration_str = format_timestamp(duration)
149
  srt_file.write(f"Speaker {i} (originally {speaker}): total duration {duration_str}\n")
150
 
151
+ def process_video(video_path, language, max_speakers=3):
 
 
 
152
  base_name = os.path.splitext(video_path)[0]
153
  audio_path = f"{base_name}.wav"
154
  extract_audio(video_path, audio_path)