# Spaces: Sleeping  (page-header text captured together with the file; not program code)
import logging
import os
import re
import subprocess
import tempfile
import warnings
from typing import Optional, Tuple

import gradio as gr
import torch
import whisper
from transformers import pipeline
# Suppress every Python warning raised after this point (blanket filter).
warnings.filterwarnings("ignore")

# Configure logging: root logger at INFO, plus a module-level logger used below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class SubtitleTranslator:
    """Generate SRT subtitles for a video and optionally translate them to English.

    The pipeline is: extract the audio track with the ``ffmpeg`` command-line
    tool, transcribe it with a Whisper model, optionally translate each
    transcribed segment with a Hugging Face ``translation`` pipeline, and
    write the result as ``.srt`` files.  Both models are loaded on first use.
    """

    def __init__(self):
        # Models are created by load_models() so that constructing the object
        # does not load any weights.
        self.whisper_model = None
        self.translator = None
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Using device: {self.device}")

    def load_models(self):
        """Load the Whisper and translation models if they are not loaded yet."""
        if self.whisper_model is None:
            logger.info("Loading Whisper model...")
            self.whisper_model = whisper.load_model("base", device=self.device)
        if self.translator is None:
            logger.info("Loading translation model...")
            # transformers pipelines take a device index: 0 = first GPU, -1 = CPU.
            device_index = 0 if self.device == "cuda" else -1
            try:
                self.translator = pipeline(
                    "translation",
                    model="Helsinki-NLP/opus-mt-mul-en",
                    device=device_index,
                )
            except Exception as e:
                # Deliberate best-effort: any failure falls back to the second model.
                logger.warning(f"Failed to load Helsinki model, using Facebook model: {e}")
                self.translator = pipeline(
                    "translation",
                    model="facebook/m2m100_418M",
                    device=device_index,
                )

    def extract_audio(self, video_path: str) -> str:
        """Extract the audio track of *video_path* into a temporary WAV file.

        The audio is written as 16 kHz, mono, 16-bit PCM.

        Returns:
            Path of the created ``.wav`` file.  The caller must delete it.

        Raises:
            RuntimeError: if ``ffmpeg`` cannot be started or exits with an error.
        """
        # mkstemp creates the file atomically; the deprecated mktemp only
        # returned a name and was open to a race with other processes.
        fd, audio_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)
        cmd = [
            "ffmpeg", "-y",  # -y: overwrite the placeholder created above
            "-i", video_path,
            "-vn", "-acodec", "pcm_s16le",
            "-ar", "16000", "-ac", "1",
            audio_path,
        ]
        try:
            subprocess.run(cmd, check=True, capture_output=True)
        except (subprocess.CalledProcessError, OSError) as e:
            # OSError covers a missing/unexecutable ffmpeg binary.
            if os.path.exists(audio_path):
                os.unlink(audio_path)
            stderr = getattr(e, "stderr", None)
            detail = stderr.decode("utf-8", errors="replace").strip() if stderr else str(e)
            logger.error(f"Audio extraction failed: {detail}")
            raise RuntimeError("Failed to extract audio from video") from e
        logger.info(f"Audio extracted to: {audio_path}")
        return audio_path

    def transcribe_audio(self, audio_path: str) -> dict:
        """Transcribe *audio_path* with the loaded Whisper model.

        Returns:
            The dictionary returned by ``whisper_model.transcribe``; this class
            reads its ``'language'`` and ``'segments'`` entries.

        Raises:
            RuntimeError: if transcription fails for any reason (including the
                model not having been loaded).
        """
        try:
            logger.info("Starting transcription...")
            result = self.whisper_model.transcribe(
                audio_path,
                task="transcribe",
                fp16=self.device == "cuda",  # half precision only on GPU
            )
            logger.info("Transcription completed")
            return result
        except Exception as e:
            logger.error(f"Transcription failed: {e}")
            raise RuntimeError("Failed to transcribe audio") from e

    def translate_text(self, text: str, source_lang: Optional[str] = None) -> str:
        """Translate *text* to English, returning it unchanged on failure.

        Args:
            text: Text to translate.
            source_lang: Language code of *text*; ``"en"`` short-circuits.

        Returns:
            The English translation, ``""`` for blank input, or the original
            text if the source is already English or translation fails.
        """
        if not text.strip():
            return ""
        # If already in English, return as is
        if source_lang == "en":
            return text
        try:
            # Pick the calling convention from the tokenizer's capabilities.
            # (The model's class name never contains the hub repo name, so it
            # cannot be used to recognise the Helsinki model.)
            if hasattr(self.translator.tokenizer, "get_lang_id"):
                # Multilingual M2M100-style model: the pipeline needs explicit
                # source and target languages.
                result = self.translator(text, src_lang=source_lang, tgt_lang="en")
            else:
                # Fixed-direction model (e.g. Helsinki opus-mt-mul-en).
                result = self.translator(text)
            return result[0]['translation_text'] if result else text
        except Exception as e:
            logger.error(f"Translation failed: {e}")
            return text  # Return original if translation fails

    def format_time(self, seconds: float) -> str:
        """Format *seconds* as an SRT timestamp ``HH:MM:SS,mmm``.

        The value is rounded to whole milliseconds before being split into
        fields, so rounding carries correctly (59.9996 -> ``00:01:00,000``).
        Negative values are clamped to zero.
        """
        total_ms = max(0, int(round(seconds * 1000)))
        hours, remainder = divmod(total_ms, 3_600_000)
        minutes, remainder = divmod(remainder, 60_000)
        secs, millis = divmod(remainder, 1000)
        return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"

    def create_srt(self, segments: list, translated: bool = False) -> str:
        """Render *segments* as SRT text.

        Args:
            segments: Dicts with ``'start'``, ``'end'`` and ``'text'`` keys and
                optionally ``'translated_text'``.
            translated: Use ``'translated_text'`` where present, falling back
                to ``'text'``.

        Returns:
            The SRT document; an empty string for no segments.
        """
        entries = []
        for index, segment in enumerate(segments, 1):
            start_time = self.format_time(segment['start'])
            end_time = self.format_time(segment['end'])
            text = segment['text']
            if translated:
                text = segment.get('translated_text', text)
            # Strip surrounding whitespace so cue text starts at the line start.
            entries.append(f"{index}\n{start_time} --> {end_time}\n{text.strip()}\n\n")
        return "".join(entries)

    def _write_srt(self, content: str) -> str:
        """Write *content* to a new temporary ``.srt`` file and return its path."""
        fd, path = tempfile.mkstemp(suffix=".srt")
        with os.fdopen(fd, 'w', encoding='utf-8') as f:
            f.write(content)
        return path

    def process_video(self, video_path: str, translate: bool = True) -> Tuple[str, Optional[str], str]:
        """Produce subtitle files for *video_path*.

        Args:
            video_path: Path of the video file to process.
            translate: Also produce English subtitles when the detected
                language is not English.

        Returns:
            ``(original_srt_path, translated_srt_path_or_None, info_message)``.

        Raises:
            gr.Error: if any step fails.
        """
        try:
            # Load models
            self.load_models()
            # Extract audio
            audio_path = self.extract_audio(video_path)
            try:
                # Transcribe
                result = self.transcribe_audio(audio_path)
            finally:
                # The audio file is only needed for transcription.
                if os.path.exists(audio_path):
                    os.unlink(audio_path)
            detected_language = result.get('language', 'unknown')
            # Process segments
            segments = result['segments']
            needs_translation = translate and detected_language != 'en'
            if needs_translation:
                logger.info(f"Translating from {detected_language} to English...")
                for segment in segments:
                    segment['translated_text'] = self.translate_text(
                        segment['text'], detected_language
                    )
            # Create subtitle files
            original_file = self._write_srt(self.create_srt(segments, translated=False))
            translated_file = None
            if needs_translation:
                translated_file = self._write_srt(self.create_srt(segments, translated=True))
            return original_file, translated_file, f"Detected language: {detected_language}"
        except Exception as e:
            logger.error(f"Processing failed: {e}")
            raise gr.Error(f"Processing failed: {str(e)}") from e
# Initialize the translator (one shared instance for the whole app; the
# constructor does not load any model).
translator = SubtitleTranslator()
def process_video_interface(video_file, translate_option):
    """Gradio callback: build subtitle files for the uploaded video.

    Args:
        video_file: Path of the uploaded file, or ``None`` if nothing was uploaded.
        translate_option: ``"Yes"`` to also produce English subtitles.

    Returns:
        ``(original_srt_path, info_message, translated_srt_path_or_None)``,
        in the order of the click handler's ``outputs`` list.

    Raises:
        gr.Error: if no file was uploaded or processing fails.
    """
    if video_file is None:
        raise gr.Error("Please upload a video file")
    translate = translate_option == "Yes"
    try:
        original_srt, translated_srt, info = translator.process_video(video_file, translate)
    except gr.Error:
        # Already a user-facing error; re-wrapping would only stack prefixes.
        raise
    except Exception as e:
        raise gr.Error(f"Error processing video: {str(e)}") from e
    return original_srt, info, translated_srt or None
# Create Gradio interface
def create_interface():
    """Build and return the Gradio Blocks UI (not launched).

    Layout: an upload/options column on the left, result widgets on the right,
    and a static instructions panel below.  The button is wired to
    ``process_video_interface``.
    """
    # NOTE(review): the icon characters in the labels/HTML below were
    # reconstructed from mis-decoded UTF-8 in the captured source; several were
    # only partially recoverable, so confirm them against the intended design.
    with gr.Blocks(
        title="Video Subtitle Translator",
        theme=gr.themes.Soft(),
        css="""
        .gradio-container {max-width: 1000px; margin: auto;}
        .subtitle-info {background: #f0f8ff; padding: 15px; border-radius: 10px; margin: 10px 0;}
        """
    ) as demo:
        gr.HTML("""
        <div style="text-align: center; padding: 20px;">
        <h1>🎬 Video Subtitle Translator</h1>
        <p>Generate and translate subtitles for any video - No size or duration limits!</p>
        <p><em>Supports all video formats • Automatic language detection • Fast processing</em></p>
        </div>
        """)
        with gr.Row():
            with gr.Column(scale=2):
                video_input = gr.File(
                    label="Upload Video File",
                    file_types=["video"],
                    type="filepath"
                )
                translate_option = gr.Radio(
                    choices=["Yes", "No"],
                    value="Yes",
                    label="Translate to English?",
                    info="Choose 'No' if you only want transcription in original language"
                )
                process_btn = gr.Button(
                    "🚀 Generate Subtitles",
                    variant="primary",
                    size="lg"
                )
            with gr.Column(scale=3):
                info_output = gr.Textbox(
                    label="Processing Info",
                    interactive=False,
                    elem_classes=["subtitle-info"]
                )
                original_output = gr.File(
                    label="📄 Original Subtitles (.srt)",
                    interactive=False
                )
                translated_output = gr.File(
                    label="🌍 English Translated Subtitles (.srt)",
                    interactive=False,
                    visible=True
                )
        gr.HTML("""
        <div style="margin-top: 30px; padding: 20px; background: #f8f9fa; border-radius: 10px;">
        <h3>📋 Instructions:</h3>
        <ol>
        <li><strong>Upload any video file</strong> - MP4, AVI, MOV, MKV, etc.</li>
        <li><strong>Choose translation option</strong> - Yes for English translation, No for original language only</li>
        <li><strong>Click "Generate Subtitles"</strong> - Processing time depends on video length</li>
        <li><strong>Download your subtitle files</strong> - Use them with any video player</li>
        </ol>
        <h3>✨ Features:</h3>
        <ul>
        <li>🎯 <strong>No size limits</strong> - Process videos of any duration</li>
        <li>🌍 <strong>Auto language detection</strong> - Supports 50+ languages</li>
        <li>⚡ <strong>Lightweight models</strong> - Fast processing on any hardware</li>
        <li>📱 <strong>Universal compatibility</strong> - Works with all video formats</li>
        <li>🔧 <strong>SRT format</strong> - Compatible with all media players</li>
        </ul>
        </div>
        """)
        # Set up the processing; the outputs order must match the tuple
        # returned by process_video_interface.
        process_btn.click(
            fn=process_video_interface,
            inputs=[video_input, translate_option],
            outputs=[original_output, info_output, translated_output]
        )
    return demo
# Launch the app only when this file is run as a script.
if __name__ == "__main__":
    demo = create_interface()
    demo.launch(share=True)