"""Run Kosmos-2.5 on a sample receipt image and print the generated text.

The script loads the model and processor, downloads one sample image,
runs generation with the ``<md>`` task prompt and prints the decoded output.
"""
import re

import requests
import torch
from PIL import Image, ImageDraw
from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration

repo = "microsoft/kosmos-2.5"
device = "cuda:0"
dtype = torch.bfloat16

model = Kosmos2_5ForConditionalGeneration.from_pretrained(
    repo, device_map=device, torch_dtype=dtype
)
processor = AutoProcessor.from_pretrained(repo)

# Sample image.
# NOTE: the Hub's "/blob/" path serves the HTML file-viewer page, not the file
# itself; "/resolve/" is the path that returns the raw image bytes.
url = "https://huggingface.co/microsoft/kosmos-2.5/resolve/main/receipt_00008.png"
response = requests.get(url, stream=True, timeout=60)
# Fail with a clear HTTP error instead of an opaque PIL decoding error.
response.raise_for_status()
image = Image.open(response.raw)

# Task prompt token; per the model card "<md>" requests markdown output
# (confirm against the model documentation).
prompt = "<md>"
inputs = processor(text=prompt, images=image, return_tensors="pt")

# "height" and "width" are taken out of the processor output so that they are
# not forwarded to generate() through **inputs below.
height, width = inputs.pop("height"), inputs.pop("width")
raw_width, raw_height = image.size
# Ratio between the original image size and the size reported by the
# processor. Not used further in this snippet; presumably meant for mapping
# predicted coordinates back onto the original image -- TODO confirm.
scale_height = raw_height / height
scale_width = raw_width / width

# Move every tensor to the target device, leaving None entries untouched.
inputs = {k: v.to(device) if v is not None else None for k, v in inputs.items()}
# Cast the image patches to the dtype the model was loaded with.
inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)

generated_ids = model.generate(
    **inputs,
    max_new_tokens=1024,
)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
print(generated_text[0])