from PIL import Image
from transformers import AutoProcessor, AutoModelForImageTextToText
import torch


class OCR:
    """Extract text from an image with an image-text-to-text chat model.

    The model and its processor are loaded once in the constructor;
    :meth:`predict` then runs a single-image prompt asking the model to
    transcribe any text visible in the image.

    Attributes:
        device: ``torch.device`` the model and inputs are placed on.
        model: The loaded ``AutoModelForImageTextToText`` instance.
        processor: The matching ``AutoProcessor``.
        messages: Chat-template conversation holding the image placeholder
            and the extraction instruction.
        max_new_tokens: Upper bound on generated tokens per prediction.
        image_size: ``(width, height)`` the image is resized to before it is
            handed to the processor, or ``None`` to skip the resize.
    """

    DEFAULT_MODEL = "google/gemma-3-12b-it"
    PROMPT = (
        "Extract and output only the text from the image in its original "
        "language. If there is no text, return nothing."
    )

    def __init__(
        self,
        device="cpu",
        model_name=DEFAULT_MODEL,
        max_new_tokens=1024,
        image_size=(1024, 1024),
    ):
        """Load the model and processor.

        Args:
            device: Anything accepted by ``torch.device`` (e.g. ``"cpu"``,
                ``"cuda:0"``).
            model_name: Checkpoint identifier used for both the model and
                the processor, so the two can never get out of sync.
            max_new_tokens: Maximum number of tokens to generate per image.
            image_size: ``(width, height)`` passed to ``PIL.Image.resize``,
                or ``None`` to give the processor the image at its native
                resolution.
        """
        self.device = torch.device(device)
        self.max_new_tokens = max_new_tokens
        self.image_size = None if image_size is None else tuple(image_size)

        self.model = AutoModelForImageTextToText.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16,
        ).to(self.device)
        self.processor = AutoProcessor.from_pretrained(model_name)

        self.messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": self.PROMPT},
                ],
            },
        ]

    @staticmethod
    def _to_hwc_uint8(image):
        """Convert an image tensor to a uint8 NumPy array PIL can read.

        Args:
            image: ``(C, H, W)`` or ``(H, W)`` tensor. Floating-point (and
                bool) tensors are treated as being in ``[0, 1]`` and scaled
                by 255; integer tensors are treated as already being in
                ``[0, 255]``. Out-of-range values are clamped.

        Returns:
            A ``uint8`` array of shape ``(H, W, C)``, or ``(H, W)`` when the
            image has a single channel.

        Raises:
            ValueError: If ``image`` is not 2- or 3-dimensional.
        """
        if image.ndim == 2:
            # Promote a bare (H, W) image to a one-channel (1, H, W) image.
            image = image.unsqueeze(0)
        if image.ndim != 3:
            raise ValueError(
                "expected a (C, H, W) or (H, W) image tensor, got shape "
                f"{tuple(image.shape)}"
            )

        # Scaling an integer tensor would wrap around (uint8) or saturate
        # every non-zero pixel, so only [0, 1]-style inputs are multiplied.
        if image.is_floating_point() or image.dtype == torch.bool:
            image = image * 255

        pixels = image.clamp(0, 255).to(torch.uint8).permute(1, 2, 0)
        if pixels.shape[-1] == 1:
            # Image.fromarray rejects (H, W, 1) arrays; grayscale must be 2-D.
            pixels = pixels[..., 0]
        return pixels.cpu().numpy()

    def predict(self, image):
        """Return the text the model reads from ``image``.

        Args:
            image: ``(C, H, W)`` or ``(H, W)`` tensor; see
                :meth:`_to_hwc_uint8` for the accepted value ranges.

        Returns:
            The decoded generation for the image as a string, with special
            tokens stripped.
        """
        picture = Image.fromarray(self._to_hwc_uint8(image)).convert("RGB")
        if self.image_size is not None:
            picture = picture.resize(self.image_size)

        prompt = self.processor.apply_chat_template(
            self.messages, add_generation_prompt=True
        )
        inputs = self.processor(
            text=prompt, images=[picture], return_tensors="pt"
        ).to(self.device)

        with torch.no_grad():
            generated_ids = self.model.generate(
                **inputs, max_new_tokens=self.max_new_tokens
            )

        # Slice off the first `prompt_length` positions so only tokens that
        # follow the prompt are decoded.
        prompt_length = inputs.input_ids.shape[-1]
        completions = self.processor.batch_decode(
            generated_ids[:, prompt_length:], skip_special_tokens=True
        )
        return completions[0]