File size: 2,590 Bytes
e002d92
7cb0b8e
e002d92
41ec54b
e002d92
7cb0b8e
e002d92
41ec54b
e002d92
41ec54b
 
9dc25d9
41ec54b
e002d92
41ec54b
 
 
 
 
e002d92
41ec54b
 
 
 
 
 
 
 
 
 
 
 
 
d4af723
41ec54b
 
e002d92
 
 
362c063
e002d92
 
9c9b591
 
e002d92
9c9b591
e002d92
362c063
e002d92
41ec54b
 
 
e002d92
d4af723
9895fa7
41ec54b
e002d92
 
41ec54b
e002d92
9895fa7
 
41ec54b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
from functools import lru_cache

import gradio as gr
import pandas as pd
import pysrt
from tqdm import tqdm
from transformers import MarianMTModel, MarianTokenizer

# Download the ISO language table (a markdown table) and pull out the ISO 639-1 codes.
url = "https://huggingface.co/Lenylvt/LanguageISO/resolve/main/iso.md"
df = pd.read_csv(url, delimiter="|", skiprows=2, header=None).dropna(axis=1, how='all')
df.columns = ['ISO 639-1', 'ISO 639-2', 'Language Name', 'Native Name']
# Markdown cells carry padding spaces; strip them so the codes are usable as-is.
df['ISO 639-1'] = df['ISO 639-1'].str.strip()

# Dropdown choices as (value, label) pairs — both are the ISO 639-1 code.
language_options = [(code, str(code)) for code in df['ISO 639-1']]

@lru_cache(maxsize=8)
def _load_translation_model(model_name):
    """Load and cache a MarianMT (tokenizer, model) pair.

    translate_srt calls translate_text once per subtitle cue; without caching,
    the model would be re-instantiated (and potentially re-downloaded) for
    every cue. lru_cache keys on the model name, so each language pair is
    loaded exactly once per process.
    """
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    return tokenizer, model


def translate_text(text, source_language_code, target_language_code):
    """Translate `text` between two languages given their ISO 639-1 codes.

    Returns the translated string on success, or a human-readable error
    message when the pair is unsupported or the model cannot be loaded
    (callers display the result directly, so errors are strings, not raises).
    """
    # Guard first: Helsinki-NLP publishes no same-language models.
    if source_language_code == target_language_code:
        return "Translation between the same languages is not supported."

    # One pretrained model exists per (source, target) ISO 639-1 pair.
    model_name = f"Helsinki-NLP/opus-mt-{source_language_code}-{target_language_code}"

    try:
        tokenizer, model = _load_translation_model(model_name)
    except Exception as e:
        return f"Failed to load model for {source_language_code} to {target_language_code}: {str(e)}"

    # Truncate to the model's 512-token context to avoid failures on long cues.
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    translated = model.generate(**encoded)
    return tokenizer.decode(translated[0], skip_special_tokens=True)

def translate_srt(input_file, source_language_code, target_language_code):
    """Translate every cue of an SRT file, preserving structure.

    The previous version returned only the bare translated lines, dropping
    cue indices and timestamps — the output was not valid SRT despite being
    labeled "Translated SRT". Here each cue's text is replaced in place and
    the cues are serialized back into standard SRT blocks.

    `input_file` is a path to an .srt file; returns the translated file as text.
    Unsupported language pairs surface as translate_text's error message in
    each cue body rather than raising.
    """
    subs = pysrt.open(input_file)

    # Translate in place so index and timing metadata are kept intact.
    for sub in tqdm(subs, desc="Translating"):
        sub.text = translate_text(sub.text, source_language_code, target_language_code)

    # Each SubRipItem serializes to "index / start --> end / text".
    return "\n".join(str(sub) for sub in subs)

# Gradio UI: pick source/target languages, upload an .srt file, read the result.
source_language_dropdown = gr.Dropdown(choices=language_options, label="Source Language")
target_language_dropdown = gr.Dropdown(choices=language_options, label="Target Language")
# gr.inputs.* was deprecated and removed in modern Gradio (the rest of this file
# already uses the current gr.Dropdown/gr.Textbox style). type="filepath" hands
# translate_srt a path string, which pysrt.open() accepts.
file_input = gr.File(label="Upload SRT File", type="filepath")

iface = gr.Interface(
    fn=translate_srt,
    inputs=[file_input, source_language_dropdown, target_language_dropdown],
    outputs=gr.Textbox(label="Translated SRT"),
    title="SRT Translator",
    description="Translate subtitles from one language to another.",
)

iface.launch()