import os
import textwrap

import torch
import gradio as gr
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
)

# ✅ 1. Use the *Instruct* checkpoint
MODEL_ID = os.getenv(
    "MODEL_ID",
    "mistralai/Mixtral-8x7B-Instruct-v0.1",  # correct instruct checkpoint name
)

# ✅ 2. Load in 4-bit so the model fits on a single-GPU Space
bnb_cfg = BitsAndBytesConfig(load_in_4bit=True)

tok = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_cfg,     # 4-bit weights
    device_map="auto",
    torch_dtype=torch.float16,       # fp16 for the non-quantised layers
    trust_remote_code=True,          # not strictly needed for Mixtral, but harmless
)

# ✅ 3. Use *text-generation* with an explicit prompt template
prompt_tmpl = (
    "Summarise the following transcript in one or two short paragraphs, "
    "point-wise, without missing any key information.\n\n"
    "### Transcript:\n{chunk}\n\n"
    "### Summary:"
)

gen = pipeline(
    "text-generation",
    model=model,
    tokenizer=tok,
    max_new_tokens=256,
    do_sample=True,      # sampling must be on for temperature to take effect
    temperature=0.1,
)

MAX_CHUNK = 6_000  # characters per chunk (roughly 1.5 k tokens)


def summarize(txt):
    # Map: summarise each chunk, then reduce: summarise the joined partials.
    parts = textwrap.wrap(txt, MAX_CHUNK, break_long_words=False)
    partials = [
        gen(prompt_tmpl.format(chunk=p))[0]["generated_text"]
        .split("### Summary:")[-1]
        .strip()
        for p in parts
    ]
    return (
        gen(prompt_tmpl.format(chunk=" ".join(partials)))[0]["generated_text"]
        .split("### Summary:")[-1]
        .strip()
    )


demo = gr.Interface(
    fn=summarize,
    inputs=gr.Textbox(lines=20, label="Transcript"),
    outputs="text",
    title="Mixtral-8×7B Transcript Summariser",
)

if __name__ == "__main__":
    demo.launch()
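
# Optional quick check of the map-reduce path without launching the Gradio UI.
# A minimal sketch: the RUN_SMOKE_TEST guard and the sample transcript below
# are illustrative placeholders, not part of the app itself.
#
#   if os.getenv("RUN_SMOKE_TEST"):
#       sample = (
#           "Alice: Welcome everyone, today we review the Q3 roadmap. "
#           "Bob: The main blocker is the data pipeline migration. "
#           "Alice: Agreed, let's schedule a follow-up for next week."
#       )
#       print(summarize(sample))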