import gradio as gr
import torch
import tempfile
import os
import requests
from moviepy import VideoFileClip
from transformers import pipeline
import torchaudio
from speechbrain.pretrained.interfaces import foreign_class
from transformers import WhisperForConditionalGeneration, WhisperProcessor

# Load Whisper model to confirm English
whisper_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-tiny", device="cpu")

# Loading accent classifier
classifier = foreign_class(source="Jzuluaga/accent-id-commonaccent_xlsr-en-english", pymodule_file="custom_interface.py", classname="CustomEncoderWav2vec2Classifier")

# these are for fallback in case transformer's whisper-tiny doesn't return language
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")


ACCENT_LABELS = {
    "us": "American Accent",
    "england": "British Accent",
    "australia": "Australian Accent",
    "indian": "Indian Accent",
    "canada": "Canadian Accent",
    "bermuda": "Bermudian Accent",
    "scotland": "Scottish Accent",
    "african": "African Accent",
    "ireland": "Irish Accent",
    "newzealand": "New Zealand Accent",
    "wales": "Welsh Accent",
    "malaysia": "Malaysian Accent",
    "philippines": "Philippine Accent",
    "singapore": "Singaporean Accent",
    "hongkong": "Hong Kong Accent",
    "southatlandtic": "South Atlantic Accent"
}


def classify_accent(audio_tensor, sample_rate):
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        audio_tensor = resampler(audio_tensor)

    out_prob, score, index, text_lab = classifier.classify_batch(audio_tensor)

    print(out_prob, score, index, text_lab)

    accent_label = text_lab[0]
    readable_accent = ACCENT_LABELS.get(accent_label, accent_label.title() + " Accent")


    return {
        "accent": readable_accent,
        "confidence": round(score[0].item() * 100, 2),
        "summary": f"The speaker is predicted to have a {readable_accent} with {round(score[0].item() * 100, 2)}% confidence."
    }

def download_video(url):
    video_path = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name
    response = requests.get(url, stream=True)
    with open(video_path, "wb") as f:
        for chunk in response.iter_content(chunk_size=1024*1024):
            if chunk:
                f.write(chunk)
    return video_path

def extract_audio(video_path):
    audio_path = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name
    clip = VideoFileClip(video_path)
    clip.audio.write_audiofile(audio_path, codec='pcm_s16le')
    return audio_path


def detect_language(audio_path):
    audio, sr = torchaudio.load(audio_path)
    inputs = processor(audio[0], sampling_rate=sr, return_tensors="pt")
    logits = model.forward(**inputs).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    decoded = processor.tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)
    return decoded[0]  # crude approximation


def transcribe(audio_path):
    result = whisper_pipe(audio_path, return_language=True)
    print(result)

    lang = result['chunks'][0]['language']

    if lang == None:
        lang = detect_language(audio_path)

    return result['text'], lang

def analyze_accent(url_or_file):
    try:
        print("Video path 1:", url_or_file)
        if url_or_file.startswith("http"):
            video_path = download_video(url_or_file)
        else:
            video_path = url_or_file

        print("Video path:", video_path)

        audio_path = extract_audio(video_path)


        print("Audio path:", audio_path)


        # Load audio with torchaudio
        waveform, sample_rate = torchaudio.load(audio_path)

        # Transcription (to verify English)
        transcript = transcribe(audio_path)
        if len(transcript[0].strip()) < 3:
            return "Could not understand speech. Please try another video."

        print("Transcript:", transcript)

        # Accent classification
        result = classify_accent(waveform, sample_rate)

        output = f"**Language**: {transcript[1]}\n\n"

        if transcript[1].lower() != "en" and transcript[1].lower() != "english":
            return "The video is not in English. Please provide an English video."

        output += f"**Accent**: {result['accent']}\n\n"
        output += f"**Confidence**: {result['confidence']}%\n\n"
        output += f"**Explanation**: {result['summary']}\n\n"
        output += f"**Transcript** (first 200 chars): {transcript[0][:200]}..."

        # Clean up temp files
        if url_or_file.startswith("http"):
            os.remove(video_path)

        os.remove(audio_path)

        return output
    except Exception as e:
        return f"❌ Error: {str(e)}"


with gr.Blocks() as demo:

    gr.Markdown("""
        # English Accent Classifier!

        ### How it works?
        - Takes video URL or video file  
        - Converts it into audio  
        - Uses `Whisper-tiny` to detect which language is being spoken  
        - If the detected language is English, it uses SpeechBrain's Accent ID classifier to show the speaker's accent along with a confidence score.

        **Q: What if my transformers version doesn't expose `return_language` for `whisper-tiny`?**  
        A: Then it will approximate the language by counting which language's tokens it is using the most.
        """)

    with gr.Tab("From URL"):
        url_input = gr.Textbox(label="Video URL (MP4)")
        url_output = gr.Markdown("""### Output will be shown here!""", elem_classes="output-box")
        gr.Button("Analyze").click(fn=analyze_accent, inputs=url_input, outputs=url_output)


        gr.Examples(
            examples=[["https://huggingface.co/spaces/fahadqazi/accent-classifier/resolve/main/examples/american.mp4"], ["https://huggingface.co/spaces/fahadqazi/accent-classifier/resolve/main/examples/british.mp4"]],
            inputs=[url_input],
            outputs=[url_output],
            label="Example MP4 Video URLs",
            examples_per_page=5
        )


    with gr.Tab("From File"):
        file_input = gr.File(label="Upload MP4 Video", file_types=[".mp4"])
        file_output = gr.Markdown("""### Output will be shown here!""", elem_classes="output-box")
        gr.Button("Analyze").click(fn=analyze_accent, inputs=file_input, outputs=file_output)


        gr.Examples(
            examples=[[os.getcwd() + "/examples/american.mp4"], [os.getcwd() + "/examples/british.mp4"]],
            inputs=[file_input],
            outputs=[file_output],
            label="Example MP4 Videos",
            examples_per_page=5
        )


demo.css = """
.output-box {
    min-height: 70px;
    overflow-y: auto;
    padding: 10px;
}
"""


demo.launch()