import string
import json
import os
import re
import uuid

from pydub import AudioSegment

# Ensure the output directory for generated audio, subtitle, and text files exists.
os.makedirs("./subtitles", exist_ok=True)


def clean_file_name(file_path, unique_id=True):
    """Return a filesystem-safe version of a file name, optionally suffixed with a short unique ID."""
    file_name = os.path.basename(file_path)
    file_name, file_extension = os.path.splitext(file_name)

    # Replace every run of non-alphanumeric characters with a single underscore.
    cleaned = re.sub(r'[^a-zA-Z\d]+', '_', file_name)

    # Collapse any remaining repeated underscores and trim leading/trailing ones.
    cleaned = re.sub(r'_+', '_', cleaned).strip('_')

    if unique_id:
        # Append a short random suffix so repeated uploads of the same file do not collide.
        random_suffix = uuid.uuid4().hex[:6]
        return f"{cleaned}_{random_suffix}{file_extension}"
    return f"{cleaned}{file_extension}"
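
# Illustrative example (hypothetical file name; "a1b2c3" stands in for the random
# 6-character hex suffix):
#   clean_file_name("My Interview (final).mp3")                  -> "My_Interview_final_a1b2c3.mp3"
#   clean_file_name("My Interview (final).mp3", unique_id=False) -> "My_Interview_final.mp3"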


def convert_to_mono(file_path, output_format="mp3"):
    """Convert an audio file to single-channel (mono) audio and save it under ./subtitles."""
    audio = AudioSegment.from_file(file_path)
    mono_audio = audio.set_channels(1)

    file_name = os.path.basename(file_path)
    file_name, file_extension = os.path.splitext(file_name)

    # Build a safe, unique output name and export in the requested format.
    cleaned_file_name = clean_file_name(file_name)
    output_file = f"./subtitles/{cleaned_file_name}.{output_format}"

    mono_audio.export(output_file, format=output_format)
    return output_file


def format_srt_time(seconds):
    """Convert a time in seconds to the SRT timestamp format HH:MM:SS,mmm."""
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    sec = int(seconds % 60)
    millisec = int((seconds % 1) * 1000)
    return f"{hours:02}:{minutes:02}:{sec:02},{millisec:03}"


def write_word_srt(mono_audio_path, word_level_timestamps, skip_punctuation=True):
    """Write one SRT cue per word; punctuation-only tokens are skipped by default."""
    base, _ = os.path.splitext(mono_audio_path)
    output_file = f"{base}_word_level.srt"

    with open(output_file, "w", encoding="utf-8") as f:
        index = 1
        for entry in word_level_timestamps:
            word = entry["word"]

            # Skip tokens that consist solely of punctuation (e.g. "," or "...").
            if skip_punctuation and all(c in string.punctuation for c in word):
                continue

            start_srt = format_srt_time(entry["start"])
            end_srt = format_srt_time(entry["end"])

            f.write(f"{index}\n{start_srt} --> {end_srt}\n{word}\n\n")
            index += 1
    return output_file


def write_words_to_txt(mono_audio_path, word_level_timestamps):
    """Write the plain transcript (punctuation-only tokens dropped) to a .txt file next to the audio."""
    base, _ = os.path.splitext(mono_audio_path)
    output_file = f"{base}.txt"

    with open(output_file, "w", encoding="utf-8") as f:
        words = [
            entry["word"]
            for entry in word_level_timestamps
            if not all(c in string.punctuation for c in entry["word"])
        ]
        text = " ".join(words)
        f.write(text)
    return text, output_file


def generate_professional_subtitles(mono_audio_path, word_timestamps, max_words_per_subtitle=8,
                                    max_subtitle_duration=5.0, min_pause_for_split=0.5):
    """
    Generates professional subtitles and saves them to an SRT file by:
    - Splitting at sentence boundaries (., ?, !) when possible
    - Respecting pauses (> min_pause_for_split) for natural breaks
    - Enforcing max_words_per_subtitle and max_subtitle_duration
    - Outputting standard SRT format with proper timing

    Returns:
        output_file: Path to the generated SRT file
        subtitles: List of subtitle dictionaries with text/start/end
    """
    subtitles = []
    current_sub = {
        "text": "",
        "start": None,
        "end": None,
        "word_count": 0
    }

    base, _ = os.path.splitext(mono_audio_path)
    output_file = f"{base}.srt"

    for word_data in word_timestamps:
        word = word_data['word']
        word_start = word_data['start']
        word_end = word_data['end']

        # Split when the text accumulated so far already ends a sentence, so the
        # punctuation-bearing word stays with its own sentence.
        ends_sentence = current_sub["text"].endswith(('.', '?', '!'))

        # A long silence before this word also signals a natural break.
        has_pause = (current_sub["end"] is not None and
                     word_start - current_sub["end"] > min_pause_for_split)

        should_split = (
            ends_sentence or
            has_pause or
            current_sub["word_count"] >= max_words_per_subtitle or
            (current_sub["end"] is not None and
             (word_end - current_sub["start"]) > max_subtitle_duration)
        )

        if should_split and current_sub["text"]:
            subtitles.append({
                "text": current_sub["text"].strip(),
                "start": current_sub["start"],
                "end": current_sub["end"]
            })
            current_sub = {
                "text": "",
                "start": None,
                "end": None,
                "word_count": 0
            }

        # Append the current word to the (possibly fresh) subtitle.
        if current_sub["word_count"] == 0:
            current_sub["start"] = word_start
        current_sub["text"] += (" " + word) if current_sub["text"] else word
        current_sub["end"] = word_end
        current_sub["word_count"] += 1

    # Flush the last subtitle if any words remain.
    if current_sub["text"]:
        subtitles.append({
            "text": current_sub["text"].strip(),
            "start": current_sub["start"],
            "end": current_sub["end"]
        })

    with open(output_file, "w", encoding="utf-8") as f:
        for i, sub in enumerate(subtitles, 1):
            f.write(f"{i}\n")
            f.write(f"{format_srt_time(sub['start'])} --> {format_srt_time(sub['end'])}\n")
            f.write(f"{sub['text']}\n\n")

    return output_file, subtitles
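
# Illustrative sketch (hypothetical timestamps, not real transcription output):
# for word_timestamps like
#   [{"word": "Hello", "start": 0.0, "end": 0.4},
#    {"word": "world.", "start": 0.5, "end": 0.9},
#    {"word": "How",    "start": 2.0, "end": 2.3},
#    {"word": "are",    "start": 2.4, "end": 2.6},
#    {"word": "you?",   "start": 2.7, "end": 3.0}]
# the generated SRT would contain two cues along these lines:
#   1
#   00:00:00,000 --> 00:00:00,900
#   Hello world.
#
#   2
#   00:00:02,000 --> 00:00:03,000
#   How are you?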


def for_yt_shorts(mono_audio_path, word_timestamps, min_silence_between_words=0.3, max_characters_per_subtitle=17):
    """
    Generates optimized subtitles for YouTube Shorts/Instagram Reels by:
    - Combining hyphen-split word fragments (e.g., "co-" + "-worker" → "coworker")
    - Respecting max character limits per subtitle (default: 17)
    - Creating natural breaks at pauses (> min_silence_between_words)
    - Outputting properly formatted SRT files

    Returns:
        output_file: Path to the generated SRT file
        subtitles: List of subtitle dictionaries (text/start/end)
    """
    subtitles = []
    current_sub = {
        "text": "",
        "start": None,
        "end": None,
        "char_count": 0
    }

    base, _ = os.path.splitext(mono_audio_path)
    output_file = f"{base}_shorts.srt"

    i = 0
    while i < len(word_timestamps):
        full_word = word_timestamps[i]['word']
        start_time = word_timestamps[i]['start']
        end_time = word_timestamps[i]['end']

        # Merge continuation fragments that start with "-" into the current word.
        while (i + 1 < len(word_timestamps) and
               word_timestamps[i + 1]['word'].startswith('-')):
            next_word = word_timestamps[i + 1]['word'].lstrip('-')
            full_word = full_word.rstrip('-') + next_word
            end_time = word_timestamps[i + 1]['end']
            i += 1

        # Character count if this word were appended (plus one for the joining space).
        new_char_count = current_sub["char_count"] + len(full_word) + (1 if current_sub["text"] else 0)

        needs_break = (
            new_char_count > max_characters_per_subtitle or
            (current_sub["end"] is not None and
             start_time - current_sub["end"] > min_silence_between_words)
        )

        if needs_break and current_sub["text"]:
            # Close the current subtitle and start a new one with this word.
            subtitles.append({
                "text": current_sub["text"].strip(),
                "start": current_sub["start"],
                "end": current_sub["end"]
            })
            current_sub = {
                "text": full_word,
                "start": start_time,
                "end": end_time,
                "char_count": len(full_word)
            }
        else:
            if current_sub["text"]:
                current_sub["text"] += " " + full_word
                current_sub["char_count"] += 1 + len(full_word)
            else:
                current_sub["text"] = full_word
                current_sub["start"] = start_time
                current_sub["char_count"] = len(full_word)
            current_sub["end"] = end_time

        i += 1

    # Flush the last subtitle if any words remain.
    if current_sub["text"]:
        subtitles.append({
            "text": current_sub["text"].strip(),
            "start": current_sub["start"],
            "end": current_sub["end"]
        })

    with open(output_file, "w", encoding="utf-8") as f:
        for idx, sub in enumerate(subtitles, 1):
            f.write(f"{idx}\n")
            f.write(f"{format_srt_time(sub['start'])} --> {format_srt_time(sub['end'])}\n")
            f.write(f"{sub['text']}\n\n")

    return output_file, subtitles
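
# Illustrative sketch (hypothetical tokens, assuming no pauses longer than 0.3 s):
# the tokens ["My", "co-", "-worker", "laughed"] are merged to "coworker" and wrapped
# at 17 characters, producing the cues "My coworker" (11 characters) and "laughed",
# with each cue's timing taken from the first and last word it contains.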


def word_timestamp_json(mono_audio_path, word_timestamps):
    """
    Save word timestamps as a JSON file with the same base name as the audio file.

    Args:
        mono_audio_path: Path to the audio file (e.g., "audio.wav")
        word_timestamps: List of word timestamp dictionaries

    Returns:
        output_file: Path to the generated JSON file
    """
    base, _ = os.path.splitext(mono_audio_path)
    output_file = f"{base}_word_timestamps.json"

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(word_timestamps, f, indent=2, ensure_ascii=False)

    return output_file
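
# The JSON file mirrors the input structure, e.g. (illustrative values):
# [
#   {"word": "Hello", "start": 0.0, "end": 0.4},
#   {"word": "world.", "start": 0.5, "end": 0.9}
# ]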


def save_files(mono_audio_path, word_timestamps):
    """
    Processes word timestamps and generates multiple subtitle/text formats for different use cases.

    Generates:
    1. Professional SRT subtitles (for standard videos)
    2. Word-level SRT (for short-form content)
    3. Optimized vertical-video subtitles (Shorts/Reels/TikTok)
    4. Plain speech-to-text transcript file
    5. JSON timestamp data (for developers)
    6. Raw transcript text (returned as a string for immediate use)

    Args:
        mono_audio_path: Path to the source mono audio file (e.g., the output of convert_to_mono)
        word_timestamps: List of dictionaries containing word-level timestamps
                         [{'word': str, 'start': float, 'end': float}, ...]

    Returns:
        Six values in this order:
        default_srt_path:     Traditional subtitles (8 words max per cue)
        word_level_srt_path:  Single-word segments
        shorts_srt_path:      Vertical-video optimized subtitles
        speech_text_path:     Plain text transcript file
        timestamps_json_path: Raw timestamp data file
        text:                 Raw transcript text string
    """
    default_srt_path, _ = generate_professional_subtitles(
        mono_audio_path,
        word_timestamps,
        max_words_per_subtitle=8,
        max_subtitle_duration=5.0,
        min_pause_for_split=0.5
    )

    word_level_srt_path = write_word_srt(mono_audio_path, word_timestamps)

    shorts_srt_path, _ = for_yt_shorts(
        mono_audio_path,
        word_timestamps,
        min_silence_between_words=0.3,
        max_characters_per_subtitle=17
    )

    text, speech_text_path = write_words_to_txt(mono_audio_path, word_timestamps)

    timestamps_json_path = word_timestamp_json(mono_audio_path, word_timestamps)

    return default_srt_path, word_level_srt_path, shorts_srt_path, speech_text_path, timestamps_json_path, text
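

# Minimal usage sketch, assuming the word-level timestamps come from a separate ASR step
# (e.g. a Whisper-style model). "interview.mp3" and the timestamp values below are
# placeholders, not files or data that ship with this module.
if __name__ == "__main__":
    mono_path = convert_to_mono("interview.mp3")  # placeholder input file

    # Hypothetical ASR output in the format every function here expects.
    example_timestamps = [
        {"word": "Hello", "start": 0.0, "end": 0.4},
        {"word": "world.", "start": 0.5, "end": 0.9},
    ]

    srt_path, word_srt_path, shorts_path, txt_path, json_path, transcript = save_files(
        mono_path, example_timestamps
    )
    print(transcript)
    print(srt_path, word_srt_path, shorts_path, txt_path, json_path)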