Model Card for letxbe/qwen2-7b-BoundingDocs-rephrased

letxbe/qwen2-7b-BoundingDocs-rephrased is a Qwen2-VL-7B model fine-tuned for Document Question Answering. It was trained on the BoundingDocs dataset using the rephrased version of the questions.

Model Details

Model Description

  • Developed by: LetXBe
  • Model Type: Vision LLM
  • Languages: Multilingual
  • License: CC BY 4.0
  • Finetuned From: Qwen2-VL-7B
  • Input Format: Question + document image
  • Output Format: JSON (see the example just below)
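
For illustration, each answer is returned as a JSON object with a single value field; the field name is taken from the prompt template used in the inference example below, and the answer string here is only a placeholder:

# Shape of the model's answer (field name from TEMPLATE_PROMPT below);
# the actual value depends on the question and the document.
example_answer = {"value": "extracted answer text"}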

🚀 How to Use

The model should be prompted as explained in the Qwen2-VL-7B model card, available at https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct.

Inference Example

import torch
from PIL import Image
from qwen_vl_utils import process_vision_info
from transformers import AutoModelForImageTextToText, AutoProcessor, BitsAndBytesConfig


def generate_text_from_sample(model, processor, sample, max_new_tokens=1024, device="cuda"):
    # Prepare the text input by applying the chat template
    text_input = processor.apply_chat_template(
        sample[0:2], tokenize=False, add_generation_prompt=True
    )

    # Process the visual input from the sample
    image_inputs, _ = process_vision_info(sample)

    # Prepare the inputs for the model
    model_inputs = processor(
        text=[text_input],
        images=image_inputs,
        return_tensors="pt",
    ).to(
        device
    )  # Move inputs to the specified device

    # Generate text with the model
    generated_ids = model.generate(**model_inputs, max_new_tokens=max_new_tokens)

    # Trim the generated ids to remove the input ids
    trimmed_generated_ids = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(model_inputs.input_ids, generated_ids)]

    # Decode the output text
    output_text = processor.batch_decode(
        trimmed_generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )

    return output_text[0]  # Return the first decoded output text


# Bound the image resolution seen by the processor to keep memory use predictable
min_pixels = 256 * 28 * 28
max_pixels = 512 * 28 * 28
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-7B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels, use_fast=True
)


# Load the model in 4-bit to reduce GPU memory usage
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForImageTextToText.from_pretrained(
    "letxbe/qwen2-7b-BoundingDocs-rephrased",
    device_map="cuda",
    quantization_config=bnb_config
)

system_message = """You are a Vision Language Model specialized in extracting information from document images.  
Your task is to analyze the provided document image and extract relevant information accurately.  
Documents may contain text, tables, forms, and structured or unstructured data.  
Ensure responses are precise and concise, without additional explanations unless required for clarity."""  

TEMPLATE_PROMPT = """
<starttask>
Answer the following question about the document:
Question: "{QUESTION}"
Answer completing the following format:
'''json
{{"value": ""}}
'''
<endtask>
"""

question = "question about the document"

prompt = TEMPLATE_PROMPT.format(QUESTION=question)
  
message = [
    # System message
    {
        "role": "system",
        "content": [{"type": "text", "text": system_message}],
    },
    # User turn: document image + question
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                # Placeholder blank image: replace with the actual document page (PIL Image)
                "image": Image.new("RGB", (512, 512), (255, 255, 255)),
            },
            {
                "type": "text",
                "text": prompt,
            },
        ],
    },
]

output = generate_text_from_sample(model, processor, message)

print(output)
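
The raw output contains the JSON answer requested by TEMPLATE_PROMPT. Below is a minimal parsing sketch, assuming the model reproduces the '''json ... ''' wrapper from the template; the parse_answer helper is not part of the model card, only an illustration:

import json
import re


def parse_answer(raw_output: str) -> str:
    """Extract the "value" field from the model's JSON-formatted answer.

    Assumes the answer is wrapped in '''json ... ''' as requested by
    TEMPLATE_PROMPT; falls back to the first {...} block otherwise.
    """
    match = re.search(r"'''json\s*(\{.*?\})\s*'''", raw_output, re.DOTALL)
    if match is None:
        match = re.search(r"\{.*?\}", raw_output, re.DOTALL)
    if match is None:
        return raw_output.strip()
    payload = match.group(1) if match.lastindex else match.group(0)
    try:
        return json.loads(payload).get("value", "")
    except json.JSONDecodeError:
        return raw_output.strip()


answer = parse_answer(output)
print(answer)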