import os
import re
import google.generativeai as genai
from moviepy.video.io.VideoFileClip import VideoFileClip
import tempfile
import logging
import gradio as gr
from datetime import timedelta

# Suppress moviepy logs
logging.getLogger("moviepy").setLevel(logging.ERROR)

# Configure Gemini API
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

# Create the Gemini model
model = genai.GenerativeModel("gemini-2.0-flash-exp")

# Enhanced language support
SUPPORTED_LANGUAGES = [
    "Auto Detect", "English", "Spanish", "French", "German", "Italian",
    "Portuguese", "Russian", "Japanese", "Korean", "Arabic", "Hindi",
    "Chinese", "Dutch", "Turkish", "Polish", "Vietnamese", "Thai"
]
# Magic Prompts
TRANSCRIPTION_PROMPT = """You are a professional subtitling expert. Analyze this audio and generate precise subtitles with accurate timestamps following these rules:
1. Identify natural speech segments (3-7 words)
2. Include exact start/end times in [HH:MM:SS.ms] format
3. Add speaker identification when multiple voices
4. Preserve emotional tone and punctuation
5. Format exactly like:
[00:00:05.250 -> 00:00:08.100]
Hello world! This is an example.
[00:00:08.500 -> 00:00:10.200]
Second subtitle line.
Return ONLY the subtitles with timestamps, no explanations."""

TRANSLATION_PROMPT = """You are a certified translator. Translate these subtitles to {target_language} following these rules:
1. Keep timestamps EXACTLY as original
2. Match subtitle length to original timing
3. Preserve names/technical terms
4. Use natural colloquial speech
5. Maintain line breaks and formatting
ORIGINAL SUBTITLES:
{subtitles}
TRANSLATED {target_language} SUBTITLES:"""

def extract_audio(video_path):
    """Extract high-quality audio from video"""
    video = VideoFileClip(video_path)
    audio_path = os.path.join(tempfile.gettempdir(), "high_quality_audio.wav")
    video.audio.write_audiofile(audio_path, fps=44100, nbytes=2, codec='pcm_s16le')
    video.close()  # Release the file handles held by moviepy
    return audio_path

def parse_timestamp(timestamp_str):
    """Convert timestamp string to seconds"""
    h, m, s = map(float, timestamp_str.split(':'))
    return h * 3600 + m * 60 + s
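
# Illustrative example (input shape assumed from the prompt's [HH:MM:SS.ms] format):
#   parse_timestamp("00:00:05.250")  ->  5.25 (seconds)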

def gemini_transcribe(audio_path):
    """Get timestamped transcription from Gemini"""
    with open(audio_path, "rb") as f:
        audio_data = f.read()
    response = model.generate_content(
        contents=[TRANSCRIPTION_PROMPT,
                  {'mime_type': 'audio/wav', 'data': audio_data}]
    )
    return response.text

def create_srt(subtitles_text):
    """Convert Gemini's raw output to SRT format"""

    def fmt(seconds):
        """Format seconds as an SRT timestamp (HH:MM:SS,mmm)"""
        total_ms = int(round(seconds * 1000))
        hours, rem = divmod(total_ms, 3_600_000)
        minutes, rem = divmod(rem, 60_000)
        secs, millis = divmod(rem, 1000)
        return f"{hours:02}:{minutes:02}:{secs:02},{millis:03}"

    # Split before each "[start -> end]" header so entries are detected even
    # when the model does not leave blank lines between them
    entries = re.split(r'\n(?=\[)', subtitles_text.strip())
    srt_output = []
    for idx, entry in enumerate(entries, 1):
        time_match = re.match(r'\[(.*?) -> (.*?)\]', entry)
        if not time_match:
            continue
        start_time = parse_timestamp(time_match.group(1))
        end_time = parse_timestamp(time_match.group(2))
        text = entry.split(']', 1)[1].strip()
        srt_output.append(
            f"{idx}\n"
            f"{fmt(start_time)} --> {fmt(end_time)}\n"
            f"{text}\n\n"  # Blank line separates SRT entries
        )
    return "".join(srt_output)
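
# Illustrative conversion (model output assumed to follow the prompt's format):
#   "[00:00:05.250 -> 00:00:08.100]\nHello world! This is an example."
# becomes the SRT entry:
#   1
#   00:00:05,250 --> 00:00:08,100
#   Hello world! This is an example.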

def translate_subtitles(subtitles, target_lang):
    """Translate subtitles while preserving timing"""
    prompt = TRANSLATION_PROMPT.format(
        target_language=target_lang,
        subtitles=subtitles
    )
    response = model.generate_content(prompt)
    return response.text

def process_video(video_path, source_lang, target_lang):
    """Full processing pipeline"""
    # Audio extraction
    audio_path = extract_audio(video_path)

    # Transcription
    raw_transcription = gemini_transcribe(audio_path)
    srt_original = create_srt(raw_transcription)

    # Save original
    original_srt = os.path.join(tempfile.gettempdir(), "original.srt")
    with open(original_srt, "w", encoding="utf-8") as f:
        f.write(srt_original)

    # Translation
    translated_srt = None
    if target_lang != "None":
        translated_text = translate_subtitles(srt_original, target_lang)
        translated_srt = os.path.join(tempfile.gettempdir(), "translated.srt")
        with open(translated_srt, "w", encoding="utf-8") as f:
            f.write(translated_text)

    # Cleanup
    os.remove(audio_path)
    return original_srt, translated_srt
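
# Standalone usage sketch (hypothetical path, bypassing the Gradio UI):
#   original, translated = process_video("/path/to/clip.mp4", "Auto Detect", "Spanish")
#   print(open(original, encoding="utf-8").read())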

# Gradio Interface
with gr.Blocks(theme=gr.themes.Default(spacing_size="sm")) as app:
    gr.Markdown("# 🎬 Professional Subtitle Studio")
    gr.Markdown("Generate broadcast-quality subtitles with perfect timing")

    with gr.Row():
        with gr.Column():
            video_input = gr.Video(label="Upload Video", sources=["upload"])
            # Place the two language dropdowns side by side
            with gr.Row():
                source_lang = gr.Dropdown(
                    label="Source Language",
                    choices=SUPPORTED_LANGUAGES,
                    value="Auto Detect"
                )
                target_lang = gr.Dropdown(
                    label="Translate To",
                    choices=["None"] + SUPPORTED_LANGUAGES[1:],
                    value="None"
                )
            process_btn = gr.Button("Generate Subtitles", variant="primary")

        with gr.Column():
            original_sub = gr.File(label="Original Subtitles")
            translated_sub = gr.File(label="Translated Subtitles")
            preview_area = gr.HTML("""
                <div style='border: 2px dashed #666; padding: 20px; border-radius: 8px;'>
                    <h3 style='margin-top: 0;'>Subtitle Preview</h3>
                    <div id='preview-content' style='height: 300px; overflow-y: auto;'></div>
                </div>
            """)

    process_btn.click(
        process_video,
        inputs=[video_input, source_lang, target_lang],
        outputs=[original_sub, translated_sub]
    )

if __name__ == "__main__":
    app.launch(server_port=7860, share=True)
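
# Local run sketch (assumes a valid GEMINI_API_KEY in the environment):
#   export GEMINI_API_KEY="..."
#   python app.py
# The interface is then served on port 7860; share=True additionally requests a
# temporary public Gradio link when running outside Hugging Face Spaces.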