# app.py - Main Gradio application
import gradio as gr
import whisper
from transformers import MarianMTModel, MarianTokenizer
import yt_dlp
import os
import subprocess
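# Note: the "whisper" module is provided by the "openai-whisper" PyPI package and
# "yt_dlp" by "yt-dlp"; ffmpeg must also be installed for audio extraction.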

class SubtitleTranslator:
    def __init__(self):
        # Use the smallest Whisper model for speed
        self.whisper_model = whisper.load_model("tiny")
        
        # Translation model cache
        self.translation_models = {}
        self.tokenizers = {}
        
    def download_youtube_audio(self, url):
        """Download audio from YouTube video"""
        try:
            ydl_opts = {
                'format': 'bestaudio/best',
                'outtmpl': 'temp_audio.%(ext)s',
                'postprocessors': [{
                    'key': 'FFmpegExtractAudio',
                    'preferredcodec': 'mp3',
                    'preferredquality': '192',
                }],
            }
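            # Note: the FFmpegExtractAudio post-processor requires the ffmpeg
            # binary to be available on the system.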
            
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                ydl.download([url])
            
            # Find the downloaded file
            for file in os.listdir('.'):
                if file.startswith('temp_audio') and file.endswith('.mp3'):
                    return file
            return None
        except Exception:
            return None
    
    def extract_audio_from_video(self, video_path):
        """Extract audio from uploaded video file"""
        try:
            audio_path = "temp_extracted_audio.wav"
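            # ffmpeg converts the video's audio track to 16 kHz mono PCM,
            # the sample rate and channel layout Whisper works with natively.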
            cmd = [
                'ffmpeg', '-i', video_path, 
                '-acodec', 'pcm_s16le', 
                '-ac', '1', 
                '-ar', '16000',
                audio_path, '-y'
            ]
            subprocess.run(cmd, check=True, capture_output=True)
            return audio_path
        except Exception:
            return None
    
    def transcribe_audio(self, audio_path):
        """Transcribe audio using Whisper"""
        result = self.whisper_model.transcribe(audio_path)
        return result
    
    def get_translation_model(self, source_lang, target_lang="en"):
        """Load translation model for language pair"""
        model_name = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
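        # e.g. source_lang="es" gives "Helsinki-NLP/opus-mt-es-en" (Spanish -> English)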
        
        try:
            if model_name not in self.translation_models:
                self.tokenizers[model_name] = MarianTokenizer.from_pretrained(model_name)
                self.translation_models[model_name] = MarianMTModel.from_pretrained(model_name)
            
            return self.translation_models[model_name], self.tokenizers[model_name]
        except Exception:
            # Fallback to multilingual model
            fallback_model = "Helsinki-NLP/opus-mt-mul-en"
            if fallback_model not in self.translation_models:
                self.tokenizers[fallback_model] = MarianTokenizer.from_pretrained(fallback_model)
                self.translation_models[fallback_model] = MarianMTModel.from_pretrained(fallback_model)
            return self.translation_models[fallback_model], self.tokenizers[fallback_model]
    
    def translate_text(self, text, source_lang, target_lang="en"):
        """Translate text using MarianMT"""
        if source_lang == target_lang:
            return text
        
        try:
            model, tokenizer = self.get_translation_model(source_lang, target_lang)
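            # Input is truncated to 512 tokens below; individual Whisper segments
            # are normally far shorter than that, so truncation rarely triggers.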
            inputs = tokenizer.encode(text, return_tensors="pt", truncation=True, max_length=512)
            translated = model.generate(inputs, max_length=512, num_beams=4, early_stopping=True)
            return tokenizer.decode(translated[0], skip_special_tokens=True)
        except Exception:
            return text  # Return original if translation fails
    
    def format_timestamp(self, seconds):
        """Convert seconds to SRT timestamp format"""
        hours = int(seconds // 3600)
        minutes = int((seconds % 3600) // 60)
        secs = int(seconds % 60)
        millisecs = int((seconds % 1) * 1000)
        return f"{hours:02d}:{minutes:02d}:{secs:02d},{millisecs:03d}"
    
    def create_srt(self, segments, source_lang):
        """Create SRT subtitle content"""
        srt_content = ""
        
        for i, segment in enumerate(segments, 1):
            start_time = self.format_timestamp(segment['start'])
            end_time = self.format_timestamp(segment['end'])
            
            original_text = segment['text'].strip()
            translated_text = self.translate_text(original_text, source_lang, "en")
            
            srt_content += f"{i}\n"
            srt_content += f"{start_time} --> {end_time}\n"
            srt_content += f"{translated_text}\n\n"
        
        return srt_content
    
    def process_video(self, video_input, youtube_url):
        """Main processing function"""
        try:
            # Determine input source
            if youtube_url and youtube_url.strip():
                audio_path = self.download_youtube_audio(youtube_url.strip())
                if not audio_path:
                    return "Error: Could not download YouTube video", None
            elif video_input:
                audio_path = self.extract_audio_from_video(video_input)
                if not audio_path:
                    return "Error: Could not extract audio from video", None
            else:
                return "Please provide either a video file or YouTube URL", None
            
            # Transcribe audio
            result = self.transcribe_audio(audio_path)
            
            # Detect language
            detected_lang = result.get('language', 'unknown')
            
            # Language code mapping for translation models
            lang_mapping = {
                'spanish': 'es', 'french': 'fr', 'german': 'de', 'italian': 'it',
                'portuguese': 'pt', 'russian': 'ru', 'chinese': 'zh', 'japanese': 'ja',
                'korean': 'ko', 'arabic': 'ar', 'hindi': 'hi', 'dutch': 'nl',
                'swedish': 'sv', 'norwegian': 'no', 'danish': 'da', 'finnish': 'fi'
            }
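            # Whisper usually reports ISO 639-1 codes already (e.g. "es"); the
            # mapping above only catches full language names, and .get() passes
            # codes through unchanged otherwise.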
            
            source_lang_code = lang_mapping.get(detected_lang, detected_lang)
            
            # Create SRT content
            srt_content = self.create_srt(result['segments'], source_lang_code)
            
            # Save SRT file
            srt_filename = "translated_subtitles.srt"
            with open(srt_filename, 'w', encoding='utf-8') as f:
                f.write(srt_content)
            
            # Clean up temporary files
            if os.path.exists(audio_path):
                os.remove(audio_path)
            
            status_msg = f"βœ… Processing complete!\n"
            status_msg += f"πŸ” Detected language: {detected_lang}\n"
            status_msg += f"πŸ“ Generated {len(result['segments'])} subtitle segments\n"
            status_msg += f"🌍 Translated to English"
            
            return status_msg, srt_filename
            
        except Exception as e:
            return f"Error during processing: {str(e)}", None

# Initialize the translator
translator = SubtitleTranslator()
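
# For reference, a minimal sketch of using the translator without the Gradio UI
# (the path "sample_video.mp4" is illustrative only):
#
#   status, srt_path = translator.process_video("sample_video.mp4", "")
#   print(status)      # summary: detected language, number of segments
#   print(srt_path)    # "translated_subtitles.srt" on success, None on failure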

# Create Gradio interface
def process_video_interface(video_file, youtube_url, progress=gr.Progress()):
    # translator.process_video runs as a single blocking call, so these progress
    # updates are coarse markers rather than fine-grained tracking.
    progress(0.1, desc="Starting processing...")

    progress(0.3, desc="Extracting audio, transcribing and translating...")
    result = translator.process_video(video_file, youtube_url)

    progress(1.0, desc="Complete!")

    return result

# Custom CSS for better UI
css = """
.gradio-container {
    max-width: 900px !important;
}
.title {
    text-align: center;
    color: #2563eb;
    font-size: 2.5rem;
    font-weight: bold;
    margin-bottom: 1rem;
}
.subtitle {
    text-align: center;
    color: #64748b;
    font-size: 1.2rem;
    margin-bottom: 2rem;
}
.feature-box {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    padding: 1rem;
    border-radius: 10px;
    margin: 1rem 0;
}
"""

# Create the Gradio app
with gr.Blocks(css=css, title="Video Subtitle Translator") as app:
    gr.HTML("""
    <div class="title">🎬 Video Subtitle Translator</div>
    <div class="subtitle">Generate English subtitles from any language video using AI</div>
    """)
    
    with gr.Row():
        with gr.Column():
            gr.HTML("""
            <div class="feature-box">
                <h3>🚀 Features:</h3>
                <ul>
                    <li>📹 Upload video files or paste YouTube links</li>
                    <li>🎯 Automatic speech recognition with Whisper AI</li>
                    <li>🌍 Auto-detect source language</li>
                    <li>📝 Generate accurate English subtitles</li>
                    <li>⏱️ Perfect timing synchronization</li>
                    <li>💾 Download ready-to-use SRT files</li>
                </ul>
            </div>
            """)
    
    with gr.Row():
        with gr.Column(scale=1):
            video_input = gr.File(
                label="πŸ“ Upload Video File",
                file_types=[".mp4", ".avi", ".mov", ".mkv", ".webm", ".m4v"],
                type="filepath"
            )
            
            youtube_input = gr.Textbox(
                label="πŸ”— Or paste YouTube URL",
                placeholder="https://www.youtube.com/watch?v=...",
                lines=1
            )
            
            process_btn = gr.Button(
                "🚀 Generate Subtitles",
                variant="primary",
                size="lg"
            )
        
        with gr.Column(scale=1):
            status_output = gr.Textbox(
                label="πŸ“Š Processing Status",
                lines=6,
                interactive=False
            )
            
            srt_output = gr.File(
                label="πŸ’Ύ Download SRT File",
                interactive=False
            )
    
    gr.HTML("""
    <div style="text-align: center; margin-top: 2rem; color: #64748b;">
        <p>⚡ Powered by Whisper AI & MarianMT | 🤗 Running on Hugging Face Spaces</p>
        <p>💡 Tip: For best results, use videos with clear audio and minimal background noise</p>
    </div>
    """)
    
    # Connect the processing function
    process_btn.click(
        fn=process_video_interface,
        inputs=[video_input, youtube_input],
        outputs=[status_output, srt_output],
        show_progress=True
    )

if __name__ == "__main__":
    app.launch()