Spaces:

SinghAbhinav04
/

InternVL2.5

Sleeping

App Files Files Community

InternVL2.5 / app.py

SinghAbhinav04

Update app.py

b920dbb verified 30 days ago

raw

history blame contribute delete

17.1 kB

	import os
	import torch
	import gradio as gr
	import numpy as np
	from PIL import Image
	import torchvision.transforms as T
	from torchvision.transforms.functional import InterpolationMode
	from transformers import AutoTokenizer, AutoModel
	from decord import VideoReader, cpu
	import tempfile
	import json
	from typing import List, Tuple, Optional, Union
	import logging

	# Configure logging: root logger at INFO, plus a module-scoped logger used
	# throughout this file for status and error messages.
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)

	# Constants
	# Per-channel mean/std handed to T.Normalize in build_transform.
	IMAGENET_MEAN = (0.485, 0.456, 0.406)
	IMAGENET_STD = (0.229, 0.224, 0.225)
	# Hugging Face Hub identifier passed to AutoModel/AutoTokenizer.from_pretrained.
	MODEL_PATH = "OpenGVLab/InternVL2_5-4B"

	class InternVLChatBot:
	    """Chat wrapper around the InternVL model with image/video preprocessing.

	    The model and tokenizer are loaded on construction. ``chat`` is the entry
	    point used by the UI and the API helper; the remaining methods turn PIL
	    images and video frames into the tiled, normalized tensors passed to
	    ``self.model.chat``.
	    """

	    def __init__(self):
	        self.model = None
	        self.tokenizer = None
	        self.device = "cuda" if torch.cuda.is_available() else "cpu"
	        self.generation_config = dict(max_new_tokens=1024, do_sample=True)
	        self.load_model()

	    def load_model(self):
	        """Load the InternVL model and tokenizer from ``MODEL_PATH``.

	        Raises:
	            Exception: any error raised by ``from_pretrained`` is logged and
	                re-raised unchanged (original traceback preserved).
	        """
	        try:
	            logger.info("Loading InternVL2.5-4B model...")
	            self.model = AutoModel.from_pretrained(
	                MODEL_PATH,
	                torch_dtype=torch.bfloat16,
	                low_cpu_mem_usage=True,
	                trust_remote_code=True,
	                use_flash_attn=False,
	                device_map="auto" if self.device == "cuda" else None,
	            )
	            self.tokenizer = AutoTokenizer.from_pretrained(
	                MODEL_PATH, trust_remote_code=True
	            )
	            logger.info("Model loaded successfully!")
	        except Exception as e:
	            logger.error(f"Error loading model: {str(e)}")
	            # Bare raise keeps the original traceback intact.
	            raise

	    def find_closest_aspect_ratio(self, aspect_ratio, target_ratios, width, height, image_size):
	        """Pick the (cols, rows) grid whose ratio is closest to ``aspect_ratio``.

	        Args:
	            aspect_ratio: width / height of the source image.
	            target_ratios: iterable of ``(cols, rows)`` candidates, visited in order.
	            width: source image width in pixels.
	            height: source image height in pixels.
	            image_size: side length of one square tile.

	        Returns:
	            The best ``(cols, rows)`` tuple; ``(1, 1)`` if ``target_ratios`` is empty.
	        """
	        best_ratio_diff = float('inf')
	        best_ratio = (1, 1)
	        area = width * height

	        for ratio in target_ratios:
	            target_aspect_ratio = ratio[0] / ratio[1]
	            ratio_diff = abs(aspect_ratio - target_aspect_ratio)
	            if ratio_diff < best_ratio_diff:
	                best_ratio_diff = ratio_diff
	                best_ratio = ratio
	            elif ratio_diff == best_ratio_diff:
	                # On a tie, move to the later candidate only when the source
	                # image covers more than half of that candidate grid's area.
	                if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
	                    best_ratio = ratio
	        return best_ratio

	    def dynamic_preprocess(self, image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
	        """Resize ``image`` onto a tile grid and split it into square tiles.

	        Args:
	            image: object exposing ``size``, ``resize`` and ``crop`` (a PIL image).
	            min_num: minimum number of tiles in a candidate grid.
	            max_num: maximum number of tiles in a candidate grid.
	            image_size: side length of each square tile in pixels.
	            use_thumbnail: when true and more than one tile was produced, append
	                the whole image resized to one tile.

	        Returns:
	            List of tiles in row-major order, optionally followed by the thumbnail.
	        """
	        orig_width, orig_height = image.size
	        aspect_ratio = orig_width / orig_height

	        # All (cols, rows) grids whose tile count lies in [min_num, max_num],
	        # ordered by tile count.
	        target_ratios = set(
	            (i, j) for n in range(min_num, max_num + 1)
	            for i in range(1, n + 1)
	            for j in range(1, n + 1)
	            if i * j <= max_num and i * j >= min_num
	        )
	        target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

	        cols, rows = self.find_closest_aspect_ratio(
	            aspect_ratio, target_ratios, orig_width, orig_height, image_size
	        )

	        resized_img = image.resize((image_size * cols, image_size * rows))

	        processed_images = []
	        for tile_idx in range(cols * rows):
	            col, row = tile_idx % cols, tile_idx // cols
	            box = (
	                col * image_size,
	                row * image_size,
	                (col + 1) * image_size,
	                (row + 1) * image_size,
	            )
	            processed_images.append(resized_img.crop(box))

	        if use_thumbnail and len(processed_images) != 1:
	            processed_images.append(image.resize((image_size, image_size)))

	        return processed_images

	    def build_transform(self, input_size):
	        """Build the per-tile pipeline: RGB conversion, bicubic resize, tensor, normalize."""
	        transform = T.Compose([
	            T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
	            T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
	            T.ToTensor(),
	            T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
	        ])
	        return transform

	    def load_image(self, image_path, input_size=448, max_num=12):
	        """Load an image and return its tiles stacked into one tensor.

	        Args:
	            image_path: filesystem path (``str``) or an already-opened image
	                object exposing ``convert``.
	            input_size: tile side length in pixels.
	            max_num: maximum number of tiles (a thumbnail may be added on top).

	        Returns:
	            Tensor produced by ``torch.stack`` over the transformed tiles.
	        """
	        if isinstance(image_path, str):
	            # convert() returns a fully loaded copy, so the file handle can be
	            # released immediately instead of lingering until garbage collection.
	            with Image.open(image_path) as opened:
	                image = opened.convert('RGB')
	        else:
	            image = image_path.convert('RGB')

	        transform = self.build_transform(input_size=input_size)
	        images = self.dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
	        return torch.stack([transform(img) for img in images])

	    def get_index(self, bound, fps, max_frame, first_idx=0, num_segments=32):
	        """Choose ``num_segments`` frame indices spread over a video span.

	        Args:
	            bound: optional ``(start, end)`` pair; each value is multiplied by
	                ``fps`` to obtain a frame index. Falsy means the whole video.
	            fps: frames per second of the video.
	            max_frame: index of the last available frame.
	            first_idx: lowest frame index that may be selected as the start.
	            num_segments: number of indices to return.

	        Returns:
	            ``numpy`` array of integer frame indices, one per segment.
	        """
	        if bound:
	            start, end = bound[0], bound[1]
	        else:
	            # Sentinels far outside any real video; clamped just below.
	            start, end = -100000, 100000

	        start_idx = max(first_idx, round(start * fps))
	        end_idx = min(round(end * fps), max_frame)
	        seg_size = float(end_idx - start_idx) / num_segments

	        # Each index is the segment's start offset plus half a segment.
	        frame_indices = np.array([
	            int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
	            for idx in range(num_segments)
	        ])
	        return frame_indices

	    def load_video(self, video_path, bound=None, input_size=448, max_num=1, num_segments=32):
	        """Sample frames from a video and preprocess each like an image.

	        Args:
	            video_path: path handed to ``decord.VideoReader``.
	            bound: optional ``(start, end)`` span forwarded to ``get_index``.
	            input_size: tile side length in pixels.
	            max_num: maximum number of tiles per frame.
	            num_segments: number of frames to sample.

	        Returns:
	            ``(pixel_values, num_patches_list)``: all frame tiles concatenated
	            along the first dimension, and the tile count of each frame.

	        Raises:
	            ValueError: if the video reports zero frames.
	        """
	        vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
	        if len(vr) == 0:
	            # Without this guard get_index would yield negative indices.
	            raise ValueError("Video contains no frames")
	        max_frame = len(vr) - 1
	        fps = float(vr.get_avg_fps())

	        pixel_values_list, num_patches_list = [], []
	        transform = self.build_transform(input_size=input_size)
	        frame_indices = self.get_index(bound, fps, max_frame, first_idx=0, num_segments=num_segments)

	        for frame_index in frame_indices:
	            frame = Image.fromarray(vr[frame_index].asnumpy()).convert('RGB')
	            tiles = self.dynamic_preprocess(frame, image_size=input_size, use_thumbnail=True, max_num=max_num)
	            pixel_values = torch.stack([transform(tile) for tile in tiles])
	            num_patches_list.append(pixel_values.shape[0])
	            pixel_values_list.append(pixel_values)

	        pixel_values = torch.cat(pixel_values_list)
	        return pixel_values, num_patches_list

	    def _to_model_input(self, pixel_values):
	        """Cast pixel values to the dtype/device the model was loaded with."""
	        # The model is always loaded as bfloat16 (see load_model), so the cast
	        # is needed on CPU as well, not only on CUDA.
	        pixel_values = pixel_values.to(torch.bfloat16)
	        if self.device == "cuda":
	            pixel_values = pixel_values.cuda()
	        return pixel_values

	    @staticmethod
	    def _to_model_history(history):
	        """Convert UI history into the ``(user, assistant)`` pairs given to ``model.chat``.

	        Accepts role/content dicts as well as two-item ``[user, assistant]``
	        pairs. An assistant entry with no preceding user entry is skipped.
	        """
	        pairs = []
	        last_user = None
	        for item in history or []:
	            if isinstance(item, dict) and "role" in item:
	                if item["role"] == "user":
	                    last_user = item["content"]
	                elif item["role"] == "assistant" and last_user is not None:
	                    pairs.append((last_user, item["content"]))
	            elif isinstance(item, (list, tuple)) and len(item) == 2:
	                pairs.append((item[0], item[1]))
	        return pairs

	    def chat(self, message, history, image=None, video=None):
	        """Send one user turn (optionally with an image or a video) to the model.

	        Args:
	            message: user text.
	            history: list of prior turns (role/content dicts, or two-item
	                pairs); it is extended in place. ``None`` is treated as empty.
	            image: optional image (path or PIL image); takes precedence over ``video``.
	            video: optional video path, used only when ``image`` is ``None``.

	        Returns:
	            ``("", history, None, None)``. Failures are not raised; they are
	            logged and appended to ``history`` as an assistant message.
	        """
	        if history is None:
	            history = []
	        try:
	            pixel_values = None
	            extra_kwargs = {}

	            if image is not None:
	                pixel_values = self._to_model_input(self.load_image(image, max_num=12))
	                message = f"<image>\n{message}"
	            elif video is not None:
	                pixel_values, num_patches_list = self.load_video(video, num_segments=8, max_num=1)
	                pixel_values = self._to_model_input(pixel_values)
	                video_prefix = ''.join(f'Frame{i+1}: <image>\n' for i in range(len(num_patches_list)))
	                message = f"{video_prefix}{message}"
	                # Only video input carries a per-frame patch count.
	                extra_kwargs["num_patches_list"] = num_patches_list

	            response, _ = self.model.chat(
	                self.tokenizer,
	                pixel_values,
	                message,
	                self.generation_config,
	                history=self._to_model_history(history),
	                return_history=True,
	                **extra_kwargs,
	            )

	            history.append({"role": "user", "content": message})
	            history.append({"role": "assistant", "content": response})
	            return "", history, None, None

	        except Exception as e:
	            logger.error(f"Error in chat: {str(e)}")
	            error_msg = f"Sorry, I encountered an error: {str(e)}"
	            # Same role/content shape as the success path, so consumers of the
	            # history never see mixed entry formats.
	            history.append({"role": "user", "content": message})
	            history.append({"role": "assistant", "content": error_msg})
	            return "", history, None, None

	# Initialize the chatbot
	# NOTE: InternVLChatBot.__init__ calls load_model(), so the model and tokenizer
	# are loaded as soon as this module is imported or run.
	chatbot = InternVLChatBot()

	# Create Gradio interface
	def create_interface():
	    """Assemble the Gradio Blocks UI and return it (not yet launched).

	    Layout: a chat column (history, message box, Send/Clear buttons), a media
	    column (image and video upload), a row of example-prompt buttons and a
	    footer. Send and Enter both route through the module-level ``chatbot``.
	    """

	    # Stylesheet handed to gr.Blocks.
	    custom_css = """
	.gradio-container {
	font-family: 'Arial', sans-serif;
	}
	.chat-message {
	padding: 10px;
	margin: 5px 0;
	border-radius: 10px;
	}
	.user-message {
	background-color: #e3f2fd;
	margin-left: 20px;
	}
	.bot-message {
	background-color: #f5f5f5;
	margin-right: 20px;
	}
	"""

	    # (button label, prompt placed into the message box) in display order.
	    example_specs = [
	        ("👋 Hello, introduce yourself", "Hello, who are you?"),
	        ("🖼️ Describe this image", "Please describe this image in detail."),
	        ("📝 Extract text from image", "Extract the exact text provided in the image."),
	        ("🎥 Analyze this video", "Describe this video in detail."),
	    ]

	    with gr.Blocks(css=custom_css, title="InternVL2.5-4B Chat") as interface:
	        gr.Markdown("""
	# 🤖 InternVL2.5-4B Multimodal Chat

	Welcome to the InternVL2.5-4B chat interface! This AI assistant can:
	- 💬 Have conversations with text
	- 🖼️ Analyze and describe images
	- 🎥 Process and understand videos
	- 📝 Extract text from images (OCR)
	- 🎯 Answer questions about visual content

	Instructions:
	1. Type your message in the text box
	2. Optionally upload an image or video
	3. Click Send to get a response
	4. Use "Clear" to reset the conversation
	""")

	        with gr.Row():
	            # Left: conversation and controls.
	            with gr.Column(scale=3):
	                chatbot_interface = gr.Chatbot(
	                    label="Chat History",
	                    height=500,
	                    show_copy_button=True,
	                    avatar_images=["👤", "🤖"],
	                    type="messages",
	                )

	                with gr.Row():
	                    msg = gr.Textbox(
	                        label="Your Message",
	                        placeholder="Type your message here... You can ask about images, videos, or just chat!",
	                        lines=2,
	                        scale=4,
	                    )
	                    send_btn = gr.Button("Send 📤", scale=1, variant="primary")

	                with gr.Row():
	                    clear_btn = gr.Button("Clear 🗑️", scale=1)

	            # Right: media upload.
	            with gr.Column(scale=1):
	                gr.Markdown("### 📎 Upload Media")

	                image_input = gr.Image(label="Upload Image", type="pil", height=200)

	                video_input = gr.Video(label="Upload Video", height=200)

	                gr.Markdown("""
	Supported formats:
	- Images: JPG, PNG, WEBP, GIF
	- Videos: MP4, AVI, MOV, WEBM

	Tips:
	- For images: Ask about content, extract text, or describe what you see
	- For videos: Ask for descriptions, analysis, or specific details
	- You can upload one media file at a time
	""")

	        # Example prompts
	        gr.Markdown("### 💡 Example Prompts")
	        with gr.Row():
	            example_buttons = [gr.Button(label) for label, _ in example_specs]

	        # Event handlers (names kept: they are visible to Gradio).
	        def submit_message(message, history, image, video):
	            # Blank input: leave history and uploaded media untouched.
	            if message.strip():
	                return chatbot.chat(message, history, image, video)
	            return "", history, image, video

	        def clear_chat():
	            return [], None, None

	        def set_example_prompt(prompt):
	            return prompt

	        # Send button and Enter share one handler and one set of components.
	        chat_components = [msg, chatbot_interface, image_input, video_input]
	        for register in (send_btn.click, msg.submit):
	            register(
	                fn=submit_message,
	                inputs=list(chat_components),
	                outputs=list(chat_components),
	            )

	        clear_btn.click(
	            fn=clear_chat,
	            outputs=[chatbot_interface, image_input, video_input],
	        )

	        # Each example button copies its prompt into the message box.
	        for button, (_, prompt) in zip(example_buttons, example_specs):
	            button.click(
	                fn=set_example_prompt,
	                inputs=[gr.State(prompt)],
	                outputs=[msg],
	            )

	        # Footer
	        gr.Markdown("""
	---
	About InternVL2.5-4B: A powerful multimodal AI model developed by Shanghai AI Lab, Tsinghua University and partners.

	API Usage: This interface supports API calls. The chat endpoint accepts JSON with `message`, `image`, and `video` fields.
	""")

	    return interface

	# API endpoint for external integrations
	def api_chat(message: str, image: Optional[str] = None, video: Optional[str] = None, history: Optional[List] = None):
	    """
	    API endpoint for chat functionality

	    Args:
	        message: Text message
	        image: Base64 data URL (``data:image...``) or image file path
	        video: Video file path
	        history: Prior turns, either role/content dicts
	            (``{"role": "user"|"assistant", "content": ...}``) or
	            ``[user_msg, bot_msg]`` pairs. The caller's list is not modified.

	    Returns:
	        Dictionary with ``response`` (text of the last history entry),
	        ``history`` (updated list of role/content dicts) and ``status``.
	        ``chatbot.chat`` reports its own failures as an assistant message, so
	        ``status`` is ``"error"`` only when this function itself fails.
	    """
	    try:
	        history = _history_to_messages(history)

	        # Process image if provided (handle base64 or file path)
	        image_obj = None
	        if image:
	            try:
	                if image.startswith('data:image'):
	                    # Handle base64 image
	                    import base64
	                    from io import BytesIO
	                    image_data = image.split(',')[1]
	                    image_bytes = base64.b64decode(image_data)
	                    image_obj = Image.open(BytesIO(image_bytes))
	                else:
	                    # Handle file path; convert() yields a loaded copy so the
	                    # file handle is released right away.
	                    with Image.open(image) as opened:
	                        image_obj = opened.convert('RGB')
	            except Exception as e:
	                # Best effort: an unreadable image is logged and the request
	                # continues as text-only.
	                logger.error(f"Error processing image: {str(e)}")

	        # Chat with the model
	        _, updated_history, _, _ = chatbot.chat(message, history, image_obj, video)

	        return {
	            "response": _last_response_text(updated_history),
	            "history": updated_history,
	            "status": "success"
	        }
	    except Exception as e:
	        logger.error(f"API Error: {str(e)}")
	        return {
	            "response": f"Error: {str(e)}",
	            "history": history,
	            "status": "error"
	        }


	def _history_to_messages(history):
	    """Return a new list of role/content dicts built from dict or pair entries."""
	    messages = []
	    for item in history or []:
	        if isinstance(item, dict):
	            messages.append(item)
	        elif isinstance(item, (list, tuple)) and len(item) == 2:
	            messages.append({"role": "user", "content": item[0]})
	            messages.append({"role": "assistant", "content": item[1]})
	        else:
	            logger.warning("Ignoring unrecognized history entry: %r", item)
	    return messages


	def _last_response_text(history):
	    """Return the text of the final history entry (dict or pair), or ""."""
	    if not history:
	        return ""
	    last = history[-1]
	    if isinstance(last, dict):
	        return last.get("content", "")
	    if isinstance(last, (list, tuple)) and len(last) > 1:
	        return last[1]
	    return ""

	if __name__ == "__main__":
	    # Build the UI first, then serve it with the options below.
	    interface = create_interface()

	    # Listen on all interfaces at port 7860, request a public share link and
	    # keep the API page visible.
	    launch_options = {
	        "server_name": "0.0.0.0",
	        "server_port": 7860,
	        "share": True,
	        "show_api": True,
	    }
	    interface.launch(**launch_options)