# InternVL2.5 / app.py
# SinghAbhinav04's picture
# Update app.py
# b920dbb verified
import os
import torch
import gradio as gr
import numpy as np
from PIL import Image
import torchvision.transforms as T
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoTokenizer, AutoModel
from decord import VideoReader, cpu
import tempfile
import json
from typing import List, Tuple, Optional, Union
import logging
# Configure logging
# INFO-level logging for the whole process; `logger` is the module logger used
# by InternVLChatBot and api_chat for progress and error messages.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Constants
# Per-channel mean/std handed to T.Normalize in InternVLChatBot.build_transform
# (applied after the image has been converted to RGB and to a tensor).
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
# Model identifier passed to AutoModel/AutoTokenizer.from_pretrained.
MODEL_PATH = "OpenGVLab/InternVL2_5-4B"
class InternVLChatBot:
    """Chat wrapper around the InternVL model identified by MODEL_PATH.

    Construction loads the model and tokenizer immediately. The public entry
    point is :meth:`chat`; the remaining methods turn images and videos into
    the stacked tile tensors that ``self.model.chat`` receives.
    """

    def __init__(self):
        """Pick the device, set generation options and load the model."""
        self.model = None
        self.tokenizer = None
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.generation_config = dict(max_new_tokens=1024, do_sample=True)
        self.load_model()

    def load_model(self):
        """Load the InternVL model and tokenizer.

        Raises:
            Exception: whatever ``from_pretrained`` raised; it is logged and
                re-raised unchanged.
        """
        try:
            logger.info("Loading InternVL2.5-4B model...")
            self.model = AutoModel.from_pretrained(
                MODEL_PATH,
                torch_dtype=torch.bfloat16,
                low_cpu_mem_usage=True,
                trust_remote_code=True,
                use_flash_attn=False,
                device_map="auto" if self.device == "cuda" else None,
            )
            self.tokenizer = AutoTokenizer.from_pretrained(
                MODEL_PATH, trust_remote_code=True
            )
            logger.info("Model loaded successfully!")
        except Exception as e:
            logger.error(f"Error loading model: {str(e)}")
            # Bare raise keeps the original traceback intact.
            raise

    def find_closest_aspect_ratio(self, aspect_ratio, target_ratios, width, height, image_size):
        """Return the (cols, rows) grid whose aspect ratio is closest to the image's.

        Args:
            aspect_ratio: width / height of the source image.
            target_ratios: iterable of (cols, rows) candidates, tried in order.
            width, height: source image size in pixels.
            image_size: side length of one square tile.

        Returns:
            The best candidate, or ``(1, 1)`` when ``target_ratios`` is empty.
            On an exact tie the later candidate wins only if the image area
            exceeds half the area of that candidate's tile grid.
        """
        best_ratio_diff = float('inf')
        best_ratio = (1, 1)
        area = width * height
        for ratio in target_ratios:
            target_aspect_ratio = ratio[0] / ratio[1]
            ratio_diff = abs(aspect_ratio - target_aspect_ratio)
            if ratio_diff < best_ratio_diff:
                best_ratio_diff = ratio_diff
                best_ratio = ratio
            elif ratio_diff == best_ratio_diff:
                if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                    best_ratio = ratio
        return best_ratio

    def dynamic_preprocess(self, image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
        """Resize an image to a tile grid matching its aspect ratio and cut it into tiles.

        Args:
            image: PIL image.
            min_num, max_num: inclusive bounds on the number of tiles.
            image_size: side length of each square tile in pixels.
            use_thumbnail: when true and more than one tile was produced, a
                whole-image thumbnail of ``image_size`` x ``image_size`` is
                appended as the last element.

        Returns:
            List of PIL images (tiles in row-major order, then the optional thumbnail).
        """
        orig_width, orig_height = image.size
        aspect_ratio = orig_width / orig_height

        # Every (cols, rows) grid whose tile count lies in [min_num, max_num],
        # ordered by tile count so smaller grids are considered first.
        target_ratios = set(
            (i, j) for n in range(min_num, max_num + 1)
            for i in range(1, n + 1)
            for j in range(1, n + 1)
            if i * j <= max_num and i * j >= min_num
        )
        target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

        target_aspect_ratio = self.find_closest_aspect_ratio(
            aspect_ratio, target_ratios, orig_width, orig_height, image_size
        )

        target_width = image_size * target_aspect_ratio[0]
        target_height = image_size * target_aspect_ratio[1]
        blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

        resized_img = image.resize((target_width, target_height))
        tiles_per_row = target_width // image_size
        processed_images = []
        for i in range(blocks):
            col, row = i % tiles_per_row, i // tiles_per_row
            box = (
                col * image_size,
                row * image_size,
                (col + 1) * image_size,
                (row + 1) * image_size,
            )
            processed_images.append(resized_img.crop(box))

        if use_thumbnail and len(processed_images) != 1:
            processed_images.append(image.resize((image_size, image_size)))
        return processed_images

    def build_transform(self, input_size):
        """Build the tile transform: RGB conversion, bicubic resize, tensor, normalize."""
        transform = T.Compose([
            T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
            T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
            T.ToTensor(),
            T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
        ])
        return transform

    def load_image(self, image_path, input_size=448, max_num=12):
        """Load an image and return its tiles as one stacked tensor.

        Args:
            image_path: file path (``str``) or an object with ``convert('RGB')``
                such as a PIL image.
            input_size: tile side length in pixels.
            max_num: maximum number of tiles (a thumbnail may be added on top).

        Returns:
            ``torch.Tensor`` with one entry per tile along the first dimension.
        """
        if isinstance(image_path, str):
            image = Image.open(image_path).convert('RGB')
        else:
            image = image_path.convert('RGB')
        transform = self.build_transform(input_size=input_size)
        images = self.dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
        pixel_values = [transform(img) for img in images]
        pixel_values = torch.stack(pixel_values)
        return pixel_values

    def get_index(self, bound, fps, max_frame, first_idx=0, num_segments=32):
        """Pick ``num_segments`` frame indices spread across a video.

        Args:
            bound: optional ``(start_sec, end_sec)``; falsy means the whole video.
            fps: frames per second used to convert seconds to frame numbers.
            max_frame: index of the last frame.
            first_idx: lowest index that may be used as the start.
            num_segments: number of indices to return.

        Returns:
            ``numpy.ndarray`` of integer frame indices, one taken near the
            middle of each equal-length segment.
        """
        if bound:
            start, end = bound[0], bound[1]
        else:
            # Sentinels wide enough to be clamped to the video's real extent.
            start, end = -100000, 100000
        start_idx = max(first_idx, round(start * fps))
        end_idx = min(round(end * fps), max_frame)
        seg_size = float(end_idx - start_idx) / num_segments
        frame_indices = np.array([
            int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
            for idx in range(num_segments)
        ])
        return frame_indices

    def load_video(self, video_path, bound=None, input_size=448, max_num=1, num_segments=32):
        """Sample frames from a video and return their tiles.

        Args:
            video_path: path handed to decord's ``VideoReader``.
            bound: optional ``(start_sec, end_sec)`` window, see :meth:`get_index`.
            input_size: tile side length in pixels.
            max_num: maximum tiles per frame.
            num_segments: number of frames to sample.

        Returns:
            ``(pixel_values, num_patches_list)``: the tiles of all sampled
            frames concatenated along the first dimension, and the number of
            tiles contributed by each frame.
        """
        vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
        max_frame = len(vr) - 1
        fps = float(vr.get_avg_fps())
        pixel_values_list, num_patches_list = [], []
        transform = self.build_transform(input_size=input_size)
        frame_indices = self.get_index(bound, fps, max_frame, first_idx=0, num_segments=num_segments)
        for frame_index in frame_indices:
            img = Image.fromarray(vr[frame_index].asnumpy()).convert('RGB')
            tiles = self.dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num)
            pixel_values = torch.stack([transform(tile) for tile in tiles])
            num_patches_list.append(pixel_values.shape[0])
            pixel_values_list.append(pixel_values)
        pixel_values = torch.cat(pixel_values_list)
        return pixel_values, num_patches_list

    def _to_model_input(self, pixel_values):
        """Cast tiles to the dtype the model was loaded with and move them to ``self.device``."""
        # The model is always loaded in bfloat16, so the cast is needed on CPU
        # as well as on CUDA; leaving float32 tiles on CPU mismatches the weights.
        return pixel_values.to(dtype=torch.bfloat16, device=self.device)

    @staticmethod
    def _history_to_pairs(history):
        """Convert UI history into the ``[(user, assistant), ...]`` list given to ``model.chat``.

        Accepts role/content dicts as well as two-item ``[user, assistant]``
        entries. An assistant message with no preceding user message is
        skipped instead of raising.
        """
        pairs = []
        last_user = None
        for item in history or []:
            if isinstance(item, dict) and "role" in item:
                if item["role"] == "user":
                    last_user = item["content"]
                elif item["role"] == "assistant" and last_user is not None:
                    pairs.append((last_user, item["content"]))
            elif isinstance(item, (list, tuple)) and len(item) == 2:
                pairs.append((item[0], item[1]))
        return pairs

    def chat(self, message, history, image=None, video=None):
        """Run one chat turn and append it to ``history``.

        Args:
            message: user text.
            history: list of prior turns (mutated in place); ``None`` is
                treated as an empty history.
            image: optional image (path or PIL image); takes precedence over ``video``.
            video: optional video file path.

        Returns:
            ``("", history, None, None)`` -- the values written back to the
            textbox, chat panel, image input and video input. Failures are
            logged and reported as an assistant message rather than raised.
        """
        if history is None:
            history = []
        try:
            pixel_values = None
            num_patches_list = None

            if image is not None:
                pixel_values = self._to_model_input(self.load_image(image, max_num=12))
                message = f"<image>\n{message}"
            elif video is not None:
                pixel_values, num_patches_list = self.load_video(video, num_segments=8, max_num=1)
                pixel_values = self._to_model_input(pixel_values)
                # One "<image>" placeholder per sampled frame.
                video_prefix = ''.join(f'Frame{i+1}: <image>\n' for i in range(len(num_patches_list)))
                message = f"{video_prefix}{message}"

            chat_history = self._history_to_pairs(history)

            # num_patches_list is only passed for video input.
            extra_kwargs = {}
            if num_patches_list is not None:
                extra_kwargs["num_patches_list"] = num_patches_list
            response, _ = self.model.chat(
                self.tokenizer,
                pixel_values,
                message,
                self.generation_config,
                history=chat_history,
                return_history=True,
                **extra_kwargs,
            )

            history.append({"role": "user", "content": message})
            history.append({"role": "assistant", "content": response})
            return "", history, None, None
        except Exception as e:
            logger.error(f"Error in chat: {str(e)}")
            error_msg = f"Sorry, I encountered an error: {str(e)}"
            # Same role/content shape as successful turns, so the history
            # stays uniform for consumers expecting message dicts.
            history.append({"role": "user", "content": message})
            history.append({"role": "assistant", "content": error_msg})
            return "", history, None, None
# Initialize the chatbot
# Module-level instance shared by the UI handlers and api_chat. Constructing it
# runs InternVLChatBot.__init__, which calls load_model(), so the model is
# loaded as soon as this module is imported.
chatbot = InternVLChatBot()
# Create Gradio interface
def create_interface():
    """Assemble the Gradio Blocks UI and return it without launching.

    Layout: a chat column (history, message box, Send and Clear buttons), a
    media column (image and video upload plus usage notes), a row of example
    prompt buttons and a footer. Send and Enter both route through
    ``chatbot.chat``.
    """
    # --- Static markup -----------------------------------------------------
    # The text of these literals is kept flush-left on purpose: it is passed
    # to Gradio verbatim.
    stylesheet = """
.gradio-container {
font-family: 'Arial', sans-serif;
}
.chat-message {
padding: 10px;
margin: 5px 0;
border-radius: 10px;
}
.user-message {
background-color: #e3f2fd;
margin-left: 20px;
}
.bot-message {
background-color: #f5f5f5;
margin-right: 20px;
}
"""
    intro_md = """
# 🤖 InternVL2.5-4B Multimodal Chat
Welcome to the InternVL2.5-4B chat interface! This AI assistant can:
- 💬 Have conversations with text
- 🖼️ Analyze and describe images
- 🎥 Process and understand videos
- 📝 Extract text from images (OCR)
- 🎯 Answer questions about visual content
**Instructions:**
1. Type your message in the text box
2. Optionally upload an image or video
3. Click Send to get a response
4. Use "Clear" to reset the conversation
"""
    media_help_md = """
**Supported formats:**
- Images: JPG, PNG, WEBP, GIF
- Videos: MP4, AVI, MOV, WEBM
**Tips:**
- For images: Ask about content, extract text, or describe what you see
- For videos: Ask for descriptions, analysis, or specific details
- You can upload one media file at a time
"""
    footer_md = """
---
**About InternVL2.5-4B:** A powerful multimodal AI model developed by Shanghai AI Lab, Tsinghua University and partners.
**API Usage:** This interface supports API calls. The chat endpoint accepts JSON with `message`, `image`, and `video` fields.
"""
    # (button label, text placed in the message box when clicked)
    example_specs = (
        ("👋 Hello, introduce yourself", "Hello, who are you?"),
        ("🖼️ Describe this image", "Please describe this image in detail."),
        ("📝 Extract text from image", "Extract the exact text provided in the image."),
        ("🎥 Analyze this video", "Describe this video in detail."),
    )

    with gr.Blocks(css=stylesheet, title="InternVL2.5-4B Chat") as demo:
        gr.Markdown(intro_md)

        with gr.Row():
            # Left: conversation area.
            with gr.Column(scale=3):
                chat_panel = gr.Chatbot(
                    label="Chat History",
                    height=500,
                    show_copy_button=True,
                    avatar_images=["👤", "🤖"],
                    type="messages",
                )
                with gr.Row():
                    prompt_box = gr.Textbox(
                        label="Your Message",
                        placeholder="Type your message here... You can ask about images, videos, or just chat!",
                        lines=2,
                        scale=4,
                    )
                    send_button = gr.Button("Send 📤", scale=1, variant="primary")
                with gr.Row():
                    clear_button = gr.Button("Clear 🗑️", scale=1)

            # Right: media upload.
            with gr.Column(scale=1):
                gr.Markdown("### 📎 Upload Media")
                image_upload = gr.Image(label="Upload Image", type="pil", height=200)
                video_upload = gr.Video(label="Upload Video", height=200)
                gr.Markdown(media_help_md)

        gr.Markdown("### 💡 Example Prompts")
        with gr.Row():
            example_buttons = [gr.Button(label) for label, _ in example_specs]

        # --- Event handlers ------------------------------------------------
        # Handler names and registration order are kept as-is: Gradio derives
        # API endpoint names from them.
        def submit_message(message, history, image, video):
            # Whitespace-only input is a no-op that leaves uploads in place.
            if message.strip():
                return chatbot.chat(message, history, image, video)
            return "", history, image, video

        def clear_chat():
            return [], None, None

        def set_example_prompt(prompt):
            return prompt

        # --- Wiring --------------------------------------------------------
        # The send button and pressing Enter in the textbox do the same thing.
        turn_components = [prompt_box, chat_panel, image_upload, video_upload]
        send_button.click(
            fn=submit_message,
            inputs=turn_components,
            outputs=turn_components,
        )
        prompt_box.submit(
            fn=submit_message,
            inputs=turn_components,
            outputs=turn_components,
        )
        clear_button.click(
            fn=clear_chat,
            outputs=[chat_panel, image_upload, video_upload],
        )
        for button, (_, prompt_text) in zip(example_buttons, example_specs):
            button.click(
                fn=set_example_prompt,
                inputs=[gr.State(prompt_text)],
                outputs=[prompt_box],
            )

        gr.Markdown(footer_md)

    return demo
# API endpoint for external integrations
def api_chat(message: str, image: Optional[str] = None, video: Optional[str] = None, history: Optional[List] = None):
    """Run one chat turn for programmatic callers.

    Args:
        message: Text message.
        image: ``data:image...`` URL with base64 payload, or an image file path.
        video: Video file path.
        history: Prior conversation; a new list is used when omitted.

    Returns:
        Dictionary with ``response`` (text of the last history entry),
        ``history`` (the updated history) and ``status`` (``"success"`` or
        ``"error"``). Exceptions are logged and reported through ``status``
        rather than raised.
    """
    if history is None:
        history = []
    try:
        # Decode the image if one was supplied. A bad image is logged and the
        # turn proceeds as text-only (best effort).
        # NOTE(review): a non-data-URL value is opened as a local file path;
        # restrict or validate it if this function is exposed to untrusted callers.
        image_obj = None
        if image:
            try:
                if image.startswith('data:image'):
                    import base64
                    from io import BytesIO
                    image_data = image.split(',', 1)[1]
                    image_obj = Image.open(BytesIO(base64.b64decode(image_data)))
                else:
                    image_obj = Image.open(image)
            except Exception as e:
                logger.error(f"Error processing image: {str(e)}")

        _, updated_history, _, _ = chatbot.chat(message, history, image_obj, video)

        # The last entry is either a {"role", "content"} dict or a
        # [user, bot] pair; indexing a dict with [1] would raise KeyError.
        response = ""
        if updated_history:
            last_entry = updated_history[-1]
            if isinstance(last_entry, dict):
                response = last_entry.get("content", "")
            else:
                response = last_entry[1]

        return {
            "response": response,
            "history": updated_history,
            "status": "success"
        }
    except Exception as e:
        logger.error(f"API Error: {str(e)}")
        return {
            "response": f"Error: {str(e)}",
            "history": history,
            "status": "error"
        }
if __name__ == "__main__":
    # Script entry point: build the UI and serve it with the settings below.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": True,
        "show_api": True,
    }
    create_interface().launch(**launch_options)