import torch
import torchvision.transforms as T
from PIL import Image
from transformers import AutoModel, AutoTokenizer
import gradio as gr
import logging

# Setup logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# Device Configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ImageNet normalization values
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    """Build preprocessing pipeline for images."""
    transform = T.Compose([
        T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
        T.Resize((input_size, input_size), interpolation=T.InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])
    return transform

def preprocess_image(image, input_size=448):
    """Preprocess a PIL image into a batched tensor matching the model's dtype and device."""
    transform = build_transform(input_size)
    # Keep the input dtype in sync with how the model is loaded below:
    # bfloat16 on CUDA, float32 on CPU.
    dtype = torch.bfloat16 if device.type == "cuda" else torch.float32
    tensor_image = transform(image).unsqueeze(0).to(dtype=dtype, device=device)
    return tensor_image

# Load the model and tokenizer
logging.info("Loading model from Hugging Face Hub...")
model_path = "OpenGVLab/InternVL2_5-1B"
model = AutoModel.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    trust_remote_code=True,
).to(device).eval()

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False)

# Add the `<image>` placeholder token if missing (InternVL's `chat` helper normally
# expands it before tokenization, so this is a defensive check).
if "<image>" not in tokenizer.get_vocab():
    tokenizer.add_tokens(["<image>"])
    model.resize_token_embeddings(len(tokenizer))  # Keep embeddings in sync with the vocab

assert "<image>" in tokenizer.get_vocab(), "Error: `<image>` token is missing from tokenizer vocabulary."

def describe_image(image):
    """Extract text from the uploaded image via the model's chat interface."""
    try:
        pixel_values = preprocess_image(image, input_size=448)
        prompt = "<image>\nExtract text from the image, respond with only the extracted text."

        # Greedy decoding (do_sample=False) keeps the OCR output deterministic;
        # sampling would introduce run-to-run variation in the extracted text.
        response = model.chat(
            tokenizer=tokenizer,
            pixel_values=pixel_values,
            question=prompt,
            history=None,
            return_history=False,
            generation_config=dict(max_new_tokens=512, do_sample=False),
        )
        return response
    except Exception as e:
        logging.exception("Error during processing")
        return f"Error: {e}"

# Gradio Interface
interface = gr.Interface(
    fn=describe_image,
    inputs=gr.Image(type="pil"),
    outputs=gr.Textbox(label="Extracted Text", lines=10, interactive=False),
    title="Image to Text",
    description="Upload an image to extract text using the pretrained model.",
)
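
# Optional: calling `interface.queue()` before `launch()` queues concurrent requests,
# which can help avoid GPU memory spikes when several users submit images at once.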

if __name__ == "__main__":
    interface.launch(server_name="0.0.0.0", server_port=7860)