"""Gradio app that analyzes PDF slides with the InternVL2.5 vision-language model."""

import os
import sys
import tempfile
import traceback

import gradio as gr
import pdf2image
import torch
import torchvision.transforms as transforms
from PIL import Image
from transformers import AutoModel, AutoTokenizer

# Configuration
MODEL_NAME = "OpenGVLab/InternVL2_5-8B"
IMAGE_SIZE = 448

# Generation settings handed to ``model.chat``. A fresh copy is passed on every
# call so that nothing the model code writes into the dict leaks between calls.
GENERATION_CONFIG = {"max_new_tokens": 1024, "do_sample": False}


def load_model():
    """Load the model and tokenizer named by ``MODEL_NAME``.

    Returns:
        A ``(model, tokenizer)`` tuple, or ``(None, None)`` if loading raised;
        in that case the error and its traceback are printed.
    """
    print(f"\n=== Loading {MODEL_NAME} ===")
    cuda_available = torch.cuda.is_available()
    print(f"CUDA available: {cuda_available}")

    device = "cuda" if cuda_available else "cpu"
    print(f"Using device: {device}")

    # Load model and tokenizer with minimal options to avoid compatibility issues
    try:
        model = AutoModel.from_pretrained(
            MODEL_NAME,
            trust_remote_code=True,
            device_map="auto" if cuda_available else None,
        )
        tokenizer = AutoTokenizer.from_pretrained(
            MODEL_NAME,
            use_fast=False,
            trust_remote_code=True,
        )
        print("✓ Model and tokenizer loaded successfully!")
        return model, tokenizer
    except Exception as e:
        print(f"❌ Error loading model: {e}")
        traceback.print_exc()
        return None, None


def _resolve_upload_path(file_obj):
    """Return the on-disk path of an upload, or ``None`` if it has none.

    Accepts a path string / ``os.PathLike``, or any object whose ``name``
    attribute points at an existing file.
    """
    if isinstance(file_obj, (str, os.PathLike)):
        return os.fspath(file_obj)
    name = getattr(file_obj, "name", None)
    if isinstance(name, str) and os.path.isfile(name):
        return name
    return None


def extract_slides_from_pdf(file_obj, max_slides=None, dpi=300):
    """Render the pages of an uploaded PDF as images.

    Args:
        file_obj: The upload. May be a filesystem path, an object with a
            ``name`` attribute pointing at a file on disk, or a readable
            binary file-like object with a ``name`` (its bytes are then
            spilled to a temporary file).
        max_slides: If given and positive, only the first ``max_slides``
            pages are rendered. ``None`` renders every page.
        dpi: Rendering resolution passed to pdf2image.

    Returns:
        A list of ``(title, PIL image)`` tuples titled ``"Slide 1"``,
        ``"Slide 2"``, ... The list is empty when the file does not have a
        ``.pdf`` extension or when anything fails (the error is printed).
    """
    try:
        source_path = _resolve_upload_path(file_obj)
        display_name = source_path
        if display_name is None:
            display_name = getattr(file_obj, "name", "") or ""
        file_extension = os.path.splitext(display_name)[1].lower()

        # Check if it's a PDF
        if file_extension != ".pdf":
            return []

        temp_path = None
        try:
            if source_path is None:
                # No usable path on disk: copy the bytes into a temporary file
                # because pdf2image is given a path below.
                with tempfile.NamedTemporaryFile(
                    delete=False, suffix=file_extension
                ) as temp_file:
                    temp_path = temp_file.name
                    temp_file.write(file_obj.read())
                source_path = temp_path

            convert_kwargs = {"dpi": dpi}
            if max_slides is not None and max_slides > 0:
                # Avoid rasterising pages that will never be analysed.
                convert_kwargs["last_page"] = int(max_slides)

            try:
                images = pdf2image.convert_from_path(source_path, **convert_kwargs)
            except Exception as e:
                print(f"Error converting PDF: {e}")
                return []

            return [
                (f"Slide {number}", img)
                for number, img in enumerate(images, start=1)
            ]
        finally:
            # Clean up the temporary file even if writing or conversion failed.
            if temp_path is not None and os.path.exists(temp_path):
                os.unlink(temp_path)
    except Exception as e:
        error_msg = f"Error extracting slides: {str(e)}\n{traceback.format_exc()}"
        print(error_msg)
        return []


def preprocess_image(image):
    """Turn a PIL image into a normalized, batched tensor.

    The image is converted to RGB, resized to ``IMAGE_SIZE`` x ``IMAGE_SIZE``,
    converted to a tensor, normalized with the mean/std constants below, given
    a leading batch dimension, and moved to the GPU when CUDA is available.

    Args:
        image: A PIL image.

    Returns:
        A tensor of shape ``(1, 3, IMAGE_SIZE, IMAGE_SIZE)``.
    """
    # Normalize below is configured for three channels, so force RGB first
    # (uploads may be RGBA, grayscale, or palette images).
    img = image.convert("RGB").resize((IMAGE_SIZE, IMAGE_SIZE))

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ])

    # Apply transformation and add batch dimension
    img_tensor = transform(img).unsqueeze(0)

    if torch.cuda.is_available():
        img_tensor = img_tensor.cuda()

    return img_tensor


def analyze_image(model, tokenizer, image, prompt):
    """Ask the model a question about a single image.

    Args:
        model: Model object exposing a ``chat`` method.
        tokenizer: Tokenizer passed through to ``model.chat``.
        image: A PIL image, or ``None``.
        prompt: The question text.

    Returns:
        The model's response, a "please upload" message when ``image`` is
        ``None``, or an error string (with traceback) if anything raised.
    """
    try:
        if image is None:
            return "Please upload an image first."

        processed_image = preprocess_image(image)

        # The "<image>" placeholder marks where the image goes in the prompt,
        # following the usage shown in the InternVL model card.
        question = f"<image>\n{prompt}"

        # generation_config is passed explicitly: the model card's chat()
        # examples always supply it. A copy is used so the shared constant is
        # never modified.
        response, _ = model.chat(
            tokenizer=tokenizer,
            pixel_values=processed_image,
            question=question,
            generation_config=dict(GENERATION_CONFIG),
            history=None,
            return_history=True,
        )
        return response
    except Exception as e:
        error_msg = f"Error analyzing image: {str(e)}\n{traceback.format_exc()}"
        return error_msg


def analyze_pdf_slides(model, tokenizer, file_obj, prompt, num_slides=2):
    """Analyze the first ``num_slides`` pages of an uploaded PDF.

    Args:
        model: Model object passed to ``analyze_image``.
        tokenizer: Tokenizer passed to ``analyze_image``.
        file_obj: The uploaded file (see ``extract_slides_from_pdf``), or
            ``None``.
        prompt: The question asked about every slide.
        num_slides: Maximum number of slides to analyze.

    Returns:
        A Markdown string with one ``## Slide N`` section per analyzed slide,
        a user-facing message when there is nothing to analyze, or an error
        string (with traceback) if anything raised.
    """
    try:
        if file_obj is None:
            return "Please upload a PDF file."

        # UI sliders can deliver floats; slicing and pdf2image need an int.
        limit = int(num_slides)

        slides = extract_slides_from_pdf(
            file_obj, max_slides=limit if limit > 0 else None
        )
        if not slides:
            return "No slides were extracted from the file. Please check that it's a valid PDF."

        # Limit to the requested number of slides
        slides = slides[:limit]

        analyses = [
            (slide_title, analyze_image(model, tokenizer, slide_image, prompt))
            for slide_title, slide_image in slides
        ]

        return "".join(
            f"## {slide_title}\n\n{analysis}\n\n---\n\n"
            for slide_title, analysis in analyses
        )
    except Exception as e:
        error_msg = f"Error analyzing slides: {str(e)}\n{traceback.format_exc()}"
        return error_msg


def main():
    """Build the Gradio interface.

    Loads the model first. If loading fails, a minimal interface that only
    reports the failure is returned instead of the analyzer UI.

    Returns:
        The Gradio app object (not yet launched).
    """
    model, tokenizer = load_model()

    if model is None:
        # Create an error interface if model loading failed
        demo = gr.Interface(
            fn=lambda x: "Model loading failed. Please check the logs for details.",
            inputs=gr.Textbox(),
            outputs=gr.Textbox(),
            title="InternVL2.5 Slide Analyzer - Error",
            description="The model failed to load. Please check the logs for more information.",
        )
        return demo

    def run_analysis(file, prompt, num):
        """Click handler: bind the loaded model and tokenizer."""
        return analyze_pdf_slides(model, tokenizer, file, prompt, num)

    with gr.Blocks(title="InternVL2.5 PDF Slide Analyzer") as demo:
        gr.Markdown("# InternVL2.5 PDF Slide Analyzer")
        gr.Markdown("Upload a PDF file and analyze multiple slides")

        slide_prompts = [
            "Analyze this slide and describe its contents.",
            "What is the main message of this slide?",
            "Extract all the text visible in this slide.",
            "What are the key points presented in this slide?",
            "Describe the visual elements and layout of this slide.",
        ]

        with gr.Row():
            file_input = gr.File(label="Upload PDF")
            slide_prompt = gr.Dropdown(
                choices=slide_prompts,
                value=slide_prompts[0],
                label="Select a prompt",
                allow_custom_value=True,
            )

        num_slides = gr.Slider(
            minimum=1,
            maximum=5,
            value=2,
            step=1,
            label="Number of Slides to Analyze",
        )

        slides_analyze_btn = gr.Button("Analyze Slides")
        slides_output = gr.Markdown(label="Analysis Results")

        # Handle the slides analysis action
        slides_analyze_btn.click(
            fn=run_analysis,
            inputs=[file_input, slide_prompt, num_slides],
            outputs=slides_output,
        )

        # Add example if available
        if os.path.exists("example_slides/test_slides.pdf"):
            gr.Examples(
                examples=[
                    [
                        "example_slides/test_slides.pdf",
                        "Extract all the text visible in this slide.",
                        2,
                    ]
                ],
                inputs=[file_input, slide_prompt, num_slides],
            )

    return demo


# Run the application
if __name__ == "__main__":
    try:
        # Create and launch the interface
        demo = main()
        demo.launch(server_name="0.0.0.0")
    except Exception as e:
        print(f"Error starting the application: {e}")
        traceback.print_exc()