Spaces:

Athspi-ai
/

AutoSubGen

Running

File size: 5,871 Bytes

import os
import re
import google.generativeai as genai
from moviepy.video.io.VideoFileClip import VideoFileClip
import tempfile
import logging
import gradio as gr
from datetime import timedelta

# Suppress moviepy logs
logging.getLogger("moviepy").setLevel(logging.ERROR)

# Configure Gemini API
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

# Create the Gemini model
model = genai.GenerativeModel("gemini-2.0-flash-exp")

# Enhanced language support
SUPPORTED_LANGUAGES = [
    "Auto Detect", "English", "Spanish", "French", "German", "Italian",
    "Portuguese", "Russian", "Japanese", "Korean", "Arabic", "Hindi",
    "Chinese", "Dutch", "Turkish", "Polish", "Vietnamese", "Thai"
]

# Magic Prompts
TRANSCRIPTION_PROMPT = """You are a professional subtitling expert. Analyze this audio and generate precise subtitles with accurate timestamps following these rules:

1. Identify natural speech segments (3-7 words)
2. Include exact start/end times in [HH:MM:SS.ms] format
3. Add speaker identification when multiple voices
4. Preserve emotional tone and punctuation
5. Format exactly like:

[00:00:05.250 -> 00:00:08.100]
Hello world! This is an example.

[00:00:08.500 -> 00:00:10.200]
Second subtitle line.

Return ONLY the subtitles with timestamps, no explanations."""

TRANSLATION_PROMPT = """You are a certified translator. Translate these subtitles to {target_language} following these rules:

1. Keep timestamps EXACTLY as original
2. Match subtitle length to original timing
3. Preserve names/technical terms
4. Use natural colloquial speech
5. Maintain line breaks and formatting

ORIGINAL SUBTITLES:
{subtitles}

TRANSLATED {target_language} SUBTITLES:"""

def extract_audio(video_path):
    """Extract high-quality audio from video"""
    video = VideoFileClip(video_path)
    audio_path = os.path.join(tempfile.gettempdir(), "high_quality_audio.wav")
    video.audio.write_audiofile(audio_path, fps=44100, nbytes=2, codec='pcm_s16le')
    return audio_path

def parse_timestamp(timestamp_str):
    """Convert timestamp string to seconds"""
    h, m, s = map(float, timestamp_str.split(':'))
    return h * 3600 + m * 60 + s

def gemini_transcribe(audio_path):
    """Get timestamped transcription from Gemini"""
    with open(audio_path, "rb") as f:
        audio_data = f.read()
    
    response = model.generate_content(
        contents=[TRANSCRIPTION_PROMPT, 
                 {'mime_type': 'audio/wav', 'data': audio_data}]
    )
    return response.text

def create_srt(subtitles_text):
    """Convert Gemini's raw output to SRT format"""
    entries = re.split(r'\n{2,}', subtitles_text.strip())
    srt_output = []
    
    for idx, entry in enumerate(entries, 1):
        time_match = re.match(r'\[(.*?) -> (.*?)\]', entry)
        if not time_match:
            continue
            
        start_time = parse_timestamp(time_match.group(1))
        end_time = parse_timestamp(time_match.group(2))
        text = entry.split(']', 1)[1].strip()
        
        srt_output.append(
            f"{idx}\n"
            f"{timedelta(seconds=start_time)} --> {timedelta(seconds=end_time)}\n"
            f"{text}\n"
        )
    
    return "".join(srt_output)

def translate_subtitles(subtitles, target_lang):
    """Translate subtitles while preserving timing"""
    prompt = TRANSLATION_PROMPT.format(
        target_language=target_lang,
        subtitles=subtitles
    )
    response = model.generate_content(prompt)
    return response.text

def process_video(video_path, source_lang, target_lang):
    """Full processing pipeline"""
    # Audio extraction
    audio_path = extract_audio(video_path)
    
    # Transcription
    raw_transcription = gemini_transcribe(audio_path)
    srt_original = create_srt(raw_transcription)
    
    # Save original
    original_srt = os.path.join(tempfile.gettempdir(), "original.srt")
    with open(original_srt, "w") as f:
        f.write(srt_original)
    
    # Translation
    translated_srt = None
    if target_lang != "None":
        translated_text = translate_subtitles(srt_original, target_lang)
        translated_srt = os.path.join(tempfile.gettempdir(), "translated.srt")
        with open(translated_srt, "w") as f:
            f.write(translated_text)
    
    # Cleanup
    os.remove(audio_path)
    
    return original_srt, translated_srt

# Gradio Interface
with gr.Blocks(theme=gr.themes.Default(spacing_size="sm")) as app:
    gr.Markdown("# 🎬 Professional Subtitle Studio")
    gr.Markdown("Generate broadcast-quality subtitles with perfect timing")
    
    with gr.Row():
        with gr.Column():
            video_input = gr.Video(label="Upload Video", sources=["upload"])
            lang_row = gr.Row()
            source_lang = gr.Dropdown(
                label="Source Language",
                choices=SUPPORTED_LANGUAGES,
                value="Auto Detect"
            )
            target_lang = gr.Dropdown(
                label="Translate To",
                choices=["None"] + SUPPORTED_LANGUAGES[1:],
                value="None"
            )
            process_btn = gr.Button("Generate Subtitles", variant="primary")
        
        with gr.Column():
            original_sub = gr.File(label="Original Subtitles")
            translated_sub = gr.File(label="Translated Subtitles")
            preview_area = gr.HTML("""
                <div style='border: 2px dashed #666; padding: 20px; border-radius: 8px;'>
                    <h3 style='margin-top: 0;'>Subtitle Preview</h3>
                    <div id='preview-content' style='height: 300px; overflow-y: auto;'></div>
                </div>
            """)
    
    process_btn.click(
        process_video,
        inputs=[video_input, source_lang, target_lang],
        outputs=[original_sub, translated_sub]
    )

if __name__ == "__main__":
    app.launch(server_port=7860, share=True)