# evo_plugin_example.py — FLAN-T5 stand-in (truncation + clean kwargs)
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

class _HFSeq2SeqGenerator:
    def __init__(self, model_name: str = "google/flan-t5-small"):
        self.device = torch.device("cpu")
        self.tok = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(self.device).eval()
        # FLAN-T5 encoder max length
        ml = getattr(self.tok, "model_max_length", 512) or 512
        # Some tokenizers report a huge sentinel value; clamp to 512 for T5-small
        self.max_src_len = min(512, int(ml if ml < 10000 else 512))

    @torch.no_grad()
    def generate(self, prompt: str, max_new_tokens: int = 200, temperature: float = 0.0) -> str:
        # TRUNCATE input to model's max encoder length
        inputs = self.tok(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=self.max_src_len,
        ).to(self.device)

        do_sample = float(temperature) > 0.0

        gen_kwargs = dict(
            max_new_tokens=int(max_new_tokens),
            num_beams=4,                 # stable, less echo
            early_stopping=True,
            no_repeat_ngram_size=3,
            repetition_penalty=1.1,
            length_penalty=0.1,
        )
        # Only include sampling args when sampling is ON (silences warnings)
        if do_sample:
            gen_kwargs.update(
                do_sample=True,
                temperature=float(max(0.01, temperature)),
                top_p=0.9,
            )

        # Encourage non-trivial length without tying to input length
        gen_kwargs["min_new_tokens"] = max(48, int(0.4 * max_new_tokens))

        out = self.model.generate(**inputs, **gen_kwargs)
        return self.tok.decode(out[0], skip_special_tokens=True).strip()

def load_model():
    return _HFSeq2SeqGenerator()