# app.py - Main Gradio application
import gradio as gr
import whisper
from transformers import MarianMTModel, MarianTokenizer
import yt_dlp
import os
import subprocess
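# Note: the "whisper" module is provided by the "openai-whisper" PyPI package and
# "yt_dlp" by "yt-dlp"; ffmpeg must also be installed for audio extraction.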

class SubtitleTranslator:
    def __init__(self):
        # Use the smallest Whisper model for speed
        self.whisper_model = whisper.load_model("tiny")
        
        # Translation model cache
        self.translation_models = {}
        self.tokenizers = {}
        
    def download_youtube_audio(self, url):
        """Download audio from YouTube video"""
        try:
            ydl_opts = {
                'format': 'bestaudio/best',
                'outtmpl': 'temp_audio.%(ext)s',
                'postprocessors': [{
                    'key': 'FFmpegExtractAudio',
                    'preferredcodec': 'mp3',
                    'preferredquality': '192',
                }],
            }
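            # Note: the FFmpegExtractAudio post-processor requires the ffmpeg
            # binary to be available on the system.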
            
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                ydl.download([url])
            
            # Find the downloaded file
            for file in os.listdir('.'):
                if file.startswith('temp_audio') and file.endswith('.mp3'):
                    return file
            return None
        except Exception:
            return None
    
    def extract_audio_from_video(self, video_path):
        """Extract audio from uploaded video file"""
        try:
            audio_path = "temp_extracted_audio.wav"
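            # ffmpeg converts the video's audio track to 16 kHz mono PCM,
            # the sample rate and channel layout Whisper works with natively.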
            cmd = [
                'ffmpeg', '-i', video_path, 
                '-acodec', 'pcm_s16le', 
                '-ac', '1', 
                '-ar', '16000',
                audio_path, '-y'
            ]
            subprocess.run(cmd, check=True, capture_output=True)
            return audio_path
        except Exception:
            return None
    
    def transcribe_audio(self, audio_path):
        """Transcribe audio using Whisper"""
        result = self.whisper_model.transcribe(audio_path)
        return result
    
    def get_translation_model(self, source_lang, target_lang="en"):
        """Load translation model for language pair"""
        model_name = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
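        # e.g. source_lang="es" gives "Helsinki-NLP/opus-mt-es-en" (Spanish -> English)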
        
        try:
            if model_name not in self.translation_models:
                self.tokenizers[model_name] = MarianTokenizer.from_pretrained(model_name)
                self.translation_models[model_name] = MarianMTModel.from_pretrained(model_name)
            
            return self.translation_models[model_name], self.tokenizers[model_name]
        except Exception:
            # Fallback to multilingual model
            fallback_model = "Helsinki-NLP/opus-mt-mul-en"
            if fallback_model not in self.translation_models:
                self.tokenizers[fallback_model] = MarianTokenizer.from_pretrained(fallback_model)
                self.translation_models[fallback_model] = MarianMTModel.from_pretrained(fallback_model)
            return self.translation_models[fallback_model], self.tokenizers[fallback_model]
    
    def translate_text(self, text, source_lang, target_lang="en"):
        """Translate text using MarianMT"""
        if source_lang == target_lang:
            return text
        
        try:
            model, tokenizer = self.get_translation_model(source_lang, target_lang)
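            # Input is truncated to 512 tokens below; individual Whisper segments
            # are normally far shorter than that, so truncation rarely triggers.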
            inputs = tokenizer.encode(text, return_tensors="pt", truncation=True, max_length=512)
            translated = model.generate(inputs, max_length=512, num_beams=4, early_stopping=True)
            return tokenizer.decode(translated[0], skip_special_tokens=True)
        except Exception:
            return text  # Return original if translation fails
    
    def format_timestamp(self, seconds):
        """Convert seconds to SRT timestamp format"""
        hours = int(seconds // 3600)
        minutes = int((seconds % 3600) // 60)
        secs = int(seconds % 60)
        millisecs = int((seconds % 1) * 1000)
        return f"{hours:02d}:{minutes:02d}:{secs:02d},{millisecs:03d}"
    
    def create_srt(self, segments, source_lang):
        """Create SRT subtitle content"""
        srt_content = ""
        
        for i, segment in enumerate(segments, 1):
            start_time = self.format_timestamp(segment['start'])
            end_time = self.format_timestamp(segment['end'])
            
            original_text = segment['text'].strip()
            translated_text = self.translate_text(original_text, source_lang, "en")
            
            srt_content += f"{i}\n"
            srt_content += f"{start_time} --> {end_time}\n"
            srt_content += f"{translated_text}\n\n"
        
        return srt_content
    
    def process_video(self, video_input, youtube_url):
        """Main processing function"""
        try:
            # Determine input source
            if youtube_url and youtube_url.strip():
                audio_path = self.download_youtube_audio(youtube_url.strip())
                if not audio_path:
                    return "Error: Could not download YouTube video", None
            elif video_input:
                audio_path = self.extract_audio_from_video(video_input)
                if not audio_path:
                    return "Error: Could not extract audio from video", None
            else:
                return "Please provide either a video file or YouTube URL", None
            
            # Transcribe audio
            result = self.transcribe_audio(audio_path)
            
            # Detect language
            detected_lang = result.get('language', 'unknown')
            
            # Language code mapping for translation models
            lang_mapping = {
                'spanish': 'es', 'french': 'fr', 'german': 'de', 'italian': 'it',
                'portuguese': 'pt', 'russian': 'ru', 'chinese': 'zh', 'japanese': 'ja',
                'korean': 'ko', 'arabic': 'ar', 'hindi': 'hi', 'dutch': 'nl',
                'swedish': 'sv', 'norwegian': 'no', 'danish': 'da', 'finnish': 'fi'
            }
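            # Whisper usually reports ISO 639-1 codes already (e.g. "es"); the
            # mapping above only catches full language names, and .get() passes
            # codes through unchanged otherwise.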
            
            source_lang_code = lang_mapping.get(detected_lang, detected_lang)
            
            # Create SRT content
            srt_content = self.create_srt(result['segments'], source_lang_code)
            
            # Save SRT file
            srt_filename = "translated_subtitles.srt"
            with open(srt_filename, 'w', encoding='utf-8') as f:
                f.write(srt_content)
            
            # Clean up temporary files
            if os.path.exists(audio_path):
                os.remove(audio_path)
            
            status_msg = f"βœ… Processing complete!\n"
            status_msg += f"πŸ” Detected language: {detected_lang}\n"
            status_msg += f"πŸ“ Generated {len(result['segments'])} subtitle segments\n"
            status_msg += f"🌍 Translated to English"
            
            return status_msg, srt_filename
            
        except Exception as e:
            return f"Error during processing: {str(e)}", None

# Initialize the translator
translator = SubtitleTranslator()
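
# For reference, a minimal sketch of using the translator without the Gradio UI
# (the path "sample_video.mp4" is illustrative only):
#
#   status, srt_path = translator.process_video("sample_video.mp4", "")
#   print(status)      # summary: detected language, number of segments
#   print(srt_path)    # "translated_subtitles.srt" on success, None on failure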

# Create Gradio interface
def process_video_interface(video_file, youtube_url, progress=gr.Progress()):
    # translator.process_video runs as a single blocking call, so these progress
    # updates are coarse markers rather than fine-grained tracking.
    progress(0.1, desc="Starting processing...")

    progress(0.3, desc="Extracting audio, transcribing and translating...")
    result = translator.process_video(video_file, youtube_url)

    progress(1.0, desc="Complete!")

    return result

# Custom CSS for better UI
css = """
.gradio-container {
    max-width: 900px !important;
}
.title {
    text-align: center;
    color: #2563eb;
    font-size: 2.5rem;
    font-weight: bold;
    margin-bottom: 1rem;
}
.subtitle {
    text-align: center;
    color: #64748b;
    font-size: 1.2rem;
    margin-bottom: 2rem;
}
.feature-box {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    padding: 1rem;
    border-radius: 10px;
    margin: 1rem 0;
}
"""

# Create the Gradio app
with gr.Blocks(css=css, title="Video Subtitle Translator") as app:
    gr.HTML("""
    <div class="title">🎬 Video Subtitle Translator</div>
    <div class="subtitle">Generate English subtitles from any language video using AI</div>
    """)
    
    with gr.Row():
        with gr.Column():
            gr.HTML("""
            <div class="feature-box">
                <h3>🚀 Features:</h3>
                <ul>
                    <li>📹 Upload video files or paste YouTube links</li>
                    <li>🎯 Automatic speech recognition with Whisper AI</li>
                    <li>🌍 Auto-detect source language</li>
                    <li>📝 Generate accurate English subtitles</li>
                    <li>⏱️ Perfect timing synchronization</li>
                    <li>💾 Download ready-to-use SRT files</li>
                </ul>
            </div>
            """)
    
    with gr.Row():
        with gr.Column(scale=1):
            video_input = gr.File(
                label="πŸ“ Upload Video File",
                file_types=[".mp4", ".avi", ".mov", ".mkv", ".webm", ".m4v"],
                type="filepath"
            )
            
            youtube_input = gr.Textbox(
                label="πŸ”— Or paste YouTube URL",
                placeholder="https://www.youtube.com/watch?v=...",
                lines=1
            )
            
            process_btn = gr.Button(
                "🚀 Generate Subtitles",
                variant="primary",
                size="lg"
            )
        
        with gr.Column(scale=1):
            status_output = gr.Textbox(
                label="πŸ“Š Processing Status",
                lines=6,
                interactive=False
            )
            
            srt_output = gr.File(
                label="πŸ’Ύ Download SRT File",
                interactive=False
            )
    
    gr.HTML("""
    <div style="text-align: center; margin-top: 2rem; color: #64748b;">
        <p>⚡ Powered by Whisper AI & MarianMT | 🤗 Running on Hugging Face Spaces</p>
        <p>💡 Tip: For best results, use videos with clear audio and minimal background noise</p>
    </div>
    """)
    
    # Connect the processing function
    process_btn.click(
        fn=process_video_interface,
        inputs=[video_input, youtube_input],
        outputs=[status_output, srt_output],
        show_progress=True
    )

if __name__ == "__main__":
    app.launch()