import textwrap

import gradio as gr
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)

MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1"

# Quantise the 8x7B weights to 4-bit so the model fits on a single GPU;
# compute still runs in FP16, which keeps the mat-muls fast.
bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

tok = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_cfg,  # applies the 4-bit quantisation at load time
)

# The "### Summary:" marker lets us strip the echoed prompt from the output.
prompt_tpl = (
    "Summarise the following transcript in one or two short paragraphs, "
    "point-wise, without omitting any key information.\n\n"
    "### Transcript:\n{chunk}\n\n### Summary:"
)

gen = pipeline(
    "text-generation",
    model=model,
    tokenizer=tok,
    max_new_tokens=256,
    do_sample=True,  # temperature is ignored unless sampling is enabled
    temperature=0.3,
)

MAX_CHUNK = 6_000  # characters; roughly 1.5 k tokens at ~4 chars/token


def summarize(txt: str) -> str:
    # Map-reduce: summarise each chunk, then summarise the joined summaries.
    parts = textwrap.wrap(txt, MAX_CHUNK, break_long_words=False)
    partials = [
        gen(prompt_tpl.format(chunk=p))[0]["generated_text"]
        .split("### Summary:")[-1]
        .strip()
        for p in parts
    ]
    final = gen(prompt_tpl.format(chunk=" ".join(partials)))[0]["generated_text"]
    return final.split("### Summary:")[-1].strip()


demo = gr.Interface(
    fn=summarize,
    inputs=gr.Textbox(lines=20, label="Transcript"),
    outputs="text",
    title="Free Transcript Summariser – Mixtral-8×7B",
)

if __name__ == "__main__":
    demo.launch()
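
# --- Optional sketch: token-aware chunking -----------------------------------
# textwrap.wrap() counts characters, so MAX_CHUNK only approximates the token
# budget. A variant that splits on the tokenizer's own ids is sketched below;
# it is NOT wired into summarize() above, and the 1_500-token budget is an
# assumption chosen to leave headroom for the prompt and the 256 new tokens,
# not a measured limit.
def chunk_by_tokens(txt: str, max_tokens: int = 1_500) -> list[str]:
    """Split `txt` into pieces of at most `max_tokens` Mixtral tokens."""
    ids = tok.encode(txt, add_special_tokens=False)
    return [
        tok.decode(ids[i : i + max_tokens])
        for i in range(0, len(ids), max_tokens)
    ]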