import gradio as gr
import torch
import numpy as np
import librosa
from transformers import pipeline

# --------------------------------------------------
# ASR Pipeline (for English transcription)
# --------------------------------------------------
# English speech recognizer. It is constructed here, at import time, and the
# same pipeline object is reused by every call to predict().
_ASR_MODEL_ID = "facebook/wav2vec2-large-960h-lv60-self"
asr = pipeline(task="automatic-speech-recognition", model=_ASR_MODEL_ID)

# --------------------------------------------------
# Mapping for Target Languages and Models
# --------------------------------------------------
# Target language name -> English-to-<language> translation checkpoint.
# The entry order matters: the UI dropdown lists the languages in this order.
# NOTE(review): not every "opus-mt-en-<code>" id is guaranteed to be published
# on the Hugging Face Hub -- confirm each one before relying on it.
translation_models = dict(
    (
        ("Spanish", "Helsinki-NLP/opus-mt-en-es"),
        ("French", "Helsinki-NLP/opus-mt-en-fr"),
        ("German", "Helsinki-NLP/opus-mt-en-de"),
        ("Chinese", "Helsinki-NLP/opus-mt-en-zh"),
        ("Russian", "Helsinki-NLP/opus-mt-en-ru"),
        ("Arabic", "Helsinki-NLP/opus-mt-en-ar"),
        ("Portuguese", "Helsinki-NLP/opus-mt-en-pt"),
        ("Japanese", "Helsinki-NLP/opus-mt-en-ja"),
        ("Italian", "Helsinki-NLP/opus-mt-en-it"),
        ("Korean", "Helsinki-NLP/opus-mt-en-ko"),
    )
)

# Pipeline task name to use for each target language. The suffix is the
# language *code*, not the language name (for example "translation_en_to_zh",
# not "translation_en_to_chinese").
translation_tasks = dict(
    Spanish="translation_en_to_es",
    French="translation_en_to_fr",
    German="translation_en_to_de",
    Chinese="translation_en_to_zh",
    Russian="translation_en_to_ru",
    Arabic="translation_en_to_ar",
    Portuguese="translation_en_to_pt",
    Japanese="translation_en_to_ja",
    Italian="translation_en_to_it",
    Korean="translation_en_to_ko",
)

# Text-to-speech model identifier for each target language. get_tts() passes
# the value straight to pipeline("text-to-speech", model=...).
# NOTE(review): none of these ids has been verified; some may not exist or may
# be unofficial. They look like Coqui-TTS-style model names rather than
# Hugging Face Hub repo ids -- confirm and replace before relying on them.
tts_models = dict(
    Spanish="tts_models/es/tacotron2-DDC",
    French="tts_models/fr/tacotron2",
    German="tts_models/de/tacotron2",
    Chinese="tts_models/zh/tacotron2",
    Russian="tts_models/ru/tacotron2",
    Arabic="tts_models/ar/tacotron2",
    Portuguese="tts_models/pt/tacotron2",
    Japanese="tts_models/ja/tacotron2",
    Italian="tts_models/it/tacotron2",
    Korean="tts_models/ko/tacotron2",
)

# --------------------------------------------------
# Caches for translator and TTS pipelines
# --------------------------------------------------
# Both maps start empty and are filled on demand by get_translator() and
# get_tts(); keys are target-language names, values are the built pipelines.
translator_cache, tts_cache = {}, {}

def get_translator(target_language):
    """
    Return the translation pipeline for ``target_language``.

    The pipeline is built on the first request for a language, stored in
    ``translator_cache``, and the stored object is returned on later calls.

    Raises:
        KeyError: if ``target_language`` has no entry in
            ``translation_models`` / ``translation_tasks``.
    """
    if target_language not in translator_cache:
        checkpoint = translation_models[target_language]
        task = translation_tasks[target_language]
        translator_cache[target_language] = pipeline(task, model=checkpoint)

    return translator_cache[target_language]

def get_tts(target_language):
    """
    Return the text-to-speech pipeline for ``target_language``.

    The pipeline is built on the first request for a language, stored in
    ``tts_cache``, and the stored object is returned on later calls. A failed
    load is not cached, so the next call tries again.

    Raises:
        ValueError: if ``tts_models`` has no entry for ``target_language``,
            or if the mapped model cannot be loaded. In the second case the
            loader's exception is attached as ``__cause__``.
    """
    if target_language in tts_cache:
        return tts_cache[target_language]

    model_name = tts_models.get(target_language)
    if model_name is None:
        raise ValueError(f"No TTS model available for {target_language}.")

    try:
        tts_pipeline = pipeline("text-to-speech", model=model_name)
    except Exception as e:
        # Re-raise as ValueError for callers, but chain the original error so
        # the loader's own traceback is not lost.
        raise ValueError(
            f"Failed to load TTS model for {target_language}. "
            f"Make sure '{model_name}' exists on Hugging Face.\nError: {e}"
        ) from e

    tts_cache[target_language] = tts_pipeline
    return tts_pipeline

# --------------------------------------------------
# Prediction Function
# --------------------------------------------------
# Sampling rate the recognizer is fed at; recordings are resampled to this.
_ASR_SAMPLE_RATE = 16000


def _prepare_asr_audio(sample_rate, audio_data):
    """Return ``audio_data`` as mono float32 samples at ``_ASR_SAMPLE_RATE``."""
    samples = np.asarray(audio_data)

    # Integer PCM is rescaled to roughly [-1.0, 1.0]; a bare dtype cast would
    # keep the raw integer magnitudes. Float input is only cast to float32.
    if np.issubdtype(samples.dtype, np.integer):
        info = np.iinfo(samples.dtype)
        scale = float(max(abs(info.min), info.max))
        samples = samples.astype(np.float32) / scale
    else:
        samples = samples.astype(np.float32)

    # Collapse (samples, channels) to one channel. This deliberately includes
    # single-channel (samples, 1) input, which would otherwise stay 2-D and be
    # resampled along the wrong axis.
    if samples.ndim > 1:
        samples = samples.mean(axis=1)

    if sample_rate != _ASR_SAMPLE_RATE:
        samples = librosa.resample(
            samples, orig_sr=sample_rate, target_sr=_ASR_SAMPLE_RATE
        )
    return samples


def _to_gradio_audio(tts_result):
    """Turn a text-to-speech pipeline result into a ``(rate, samples)`` tuple."""
    # The pipeline result carries the waveform under "audio" and the rate
    # under "sampling_rate". Drop singleton axes so mono output is 1-D.
    waveform = np.squeeze(np.asarray(tts_result["audio"]))
    if waveform.ndim == 2:
        # Assumes (channels, samples) layout; flip to (samples, channels).
        waveform = waveform.T
    return tts_result["sampling_rate"], waveform


def predict(audio, text, target_language):
    """
    Run the transcribe -> translate -> synthesize chain for one request.

    1. Obtain English text: the typed text wins if it is non-blank, otherwise
       the audio is transcribed with the ASR pipeline.
    2. Translate English -> target_language.
    3. Synthesize speech in target_language.

    Args:
        audio: ``(sample_rate, samples)`` tuple from the audio input, or
            ``None`` when nothing was recorded or uploaded.
        text: English text typed by the user; may be empty or ``None``.
        target_language: a key of ``translation_models``.

    Returns:
        A 3-tuple ``(english_text, translated_text, audio_out)`` matching the
        three interface outputs. ``audio_out`` is ``(sample_rate, samples)``
        on success and ``None`` otherwise. When translation fails the second
        item holds the error message; when only synthesis fails the error is
        appended to the translation so the audio slot never receives a string.
    """
    # 1. English text from text input (if provided), else from audio via ASR.
    typed_text = (text or "").strip()  # tolerate text=None
    if typed_text:
        english_text = typed_text
    elif audio is not None:
        sample_rate, audio_data = audio
        if np.size(audio_data) == 0:
            # An empty recording carries nothing to transcribe.
            return "No input provided.", "", None

        samples = _prepare_asr_audio(sample_rate, audio_data)
        asr_result = asr({"array": samples, "sampling_rate": _ASR_SAMPLE_RATE})
        english_text = asr_result["text"]
    else:
        return "No input provided.", "", None

    # 2. Translation step. Loading the model is inside the try as well, so a
    # load failure is reported like any other translation failure. The broad
    # catch is intentional: this is the UI boundary and partial results are
    # more useful to the user than a crashed request.
    try:
        translator = get_translator(target_language)
        translated_text = translator(english_text)[0]["translation_text"]
    except Exception as e:
        return english_text, f"Translation error: {e}", None

    # 3. TTS step: synthesize speech from the translated text.
    try:
        tts_pipeline = get_tts(target_language)
        synthesized_audio = _to_gradio_audio(tts_pipeline(translated_text))
    except Exception as e:
        # Report the failure in the text output and leave the audio empty; a
        # string in the audio slot would be treated as a file path.
        return english_text, f"{translated_text}\n\n[TTS error: {e}]", None

    return english_text, translated_text, synthesized_audio

# --------------------------------------------------
# Gradio Interface Setup
# --------------------------------------------------
# Input widgets, in the order predict() receives its arguments:
# audio, text, target_language.
_input_components = [
    gr.Audio(type="numpy", label="Record/Upload English Audio (optional)"),
    gr.Textbox(
        lines=4,
        placeholder="Or enter English text here",
        label="English Text Input (optional)",
    ),
    gr.Dropdown(
        choices=list(translation_models),
        value="Spanish",
        label="Target Language",
    ),
]

# Output widgets, in the order of predict()'s returned 3-tuple.
_output_components = [
    gr.Textbox(label="English Transcription"),
    gr.Textbox(label="Translation (Target Language)"),
    gr.Audio(label="Synthesized Speech in Target Language"),
]

# Help text shown under the title.
_app_description = (
    "This app helps language learners by providing three outputs:\n"
    "1. English transcription (from ASR or text input),\n"
    "2. Translation to a target language (using Helsinki-NLP models), and\n"
    "3. Synthetic speech in the target language.\n\n"
    "Select one of the top 10 commonly used languages from the dropdown.\n"
    "Either record/upload an English audio sample or enter English text directly.\n\n"
    "Note: Some TTS models may not exist or be unstable for certain languages."
)

iface = gr.Interface(
    fn=predict,
    inputs=_input_components,
    outputs=_output_components,
    title="Multimodal Language Learning Aid",
    description=_app_description,
    allow_flagging="never",
)

# Start the web UI only when the file is run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()