import gradio as gr from PIL import Image import os import time import numpy as np import torch import warnings import stat import subprocess import sys import asyncio import nest_asyncio # Apply nest_asyncio to allow nested event loops nest_asyncio.apply() # Set environment variables os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128" # Print system information print(f"Python version: {sys.version}") print(f"PyTorch version: {torch.__version__}") print(f"CUDA available via PyTorch: {torch.cuda.is_available()}") print(f"CUDA version via PyTorch: {torch.version.cuda if torch.cuda.is_available() else 'Not available'}") # Try to run nvidia-smi def run_nvidia_smi(): try: result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) if result.returncode == 0: print("nvidia-smi output:") print(result.stdout) return True else: print("nvidia-smi error:") print(result.stderr) return False except Exception as e: print(f"Error running nvidia-smi: {str(e)}") return False # Run nvidia-smi nvidia_smi_available = run_nvidia_smi() print(f"nvidia-smi available: {nvidia_smi_available}") # Show CUDA devices if torch.cuda.is_available(): print(f"CUDA device count: {torch.cuda.device_count()}") for i in range(torch.cuda.device_count()): print(f"CUDA Device {i}: {torch.cuda.get_device_name(i)}") print(f"Current CUDA device: {torch.cuda.current_device()}") # Ensure all cache directories exist with proper permissions def setup_cache_directories(): # Gradio cache directory cache_dir = os.path.join(os.getcwd(), "gradio_cached_examples") os.makedirs(cache_dir, exist_ok=True) # HuggingFace cache directories hf_cache = os.path.join(os.getcwd(), ".cache", "huggingface") transformers_cache = os.path.join(hf_cache, "transformers") os.makedirs(hf_cache, exist_ok=True) os.makedirs(transformers_cache, exist_ok=True) # Set permissions try: for directory in [cache_dir, hf_cache, transformers_cache]: if os.path.exists(directory): os.chmod(directory, stat.S_IRWXU | stat.S_IRWXG | stat.S_IRWXO) # 0o777 print(f"Set permissions for {directory}") except Exception as e: print(f"Warning: Could not set permissions: {str(e)}") return cache_dir # Set up cache directories cache_dir = setup_cache_directories() # Suppress specific warnings that might be caused by package version mismatches warnings.filterwarnings("ignore", message=".*The 'nopython' keyword.*") warnings.filterwarnings("ignore", message=".*Torch is not compiled with CUDA enabled.*") warnings.filterwarnings("ignore", category=UserWarning) # Check for actual GPU availability def check_gpu_availability(): """Check if GPU is actually available and working""" print("Checking GPU availability...") if not torch.cuda.is_available(): print("CUDA is not available in PyTorch") return False try: # Try to initialize CUDA and run a simple operation print("Attempting to create a tensor on CUDA...") x = torch.rand(10, device="cuda") y = x + x print("Successfully created and operated on CUDA tensor") return True except Exception as e: print(f"GPU initialization failed: {str(e)}") return False # Global variables internvl2_pipeline = None MODEL_LOADED = False USE_GPU = check_gpu_availability() if USE_GPU: print("GPU is available and working properly") else: print("WARNING: GPU is not available or not working properly. This application requires GPU acceleration.") # Check if lmdeploy is available and try to import try: from lmdeploy import pipeline, TurbomindEngineConfig LMDEPLOY_AVAILABLE = True print("Successfully imported lmdeploy") except ImportError as e: LMDEPLOY_AVAILABLE = False print(f"lmdeploy import failed: {str(e)}. Will use a placeholder for demos.") # Model configuration MODEL_ID = "OpenGVLab/InternVL2-40B-AWQ" # 4-bit quantized model def load_internvl2_model(): """Load the InternVL2 model using lmdeploy""" global internvl2_pipeline, MODEL_LOADED # If already loaded, return if internvl2_pipeline is not None: return True # If lmdeploy is not available, we'll use a demo placeholder if not LMDEPLOY_AVAILABLE: print("lmdeploy not available. Using demo placeholder.") MODEL_LOADED = False return False # Check if GPU is available if not USE_GPU: print("Cannot load InternVL2 model without GPU acceleration.") MODEL_LOADED = False return False print("Loading InternVL2 model...") try: # Force synchronous execution for everything import os # Set environment variables to force synchronous behavior os.environ["CUDA_LAUNCH_BLOCKING"] = "1" # Disable asyncio in lmdeploy os.environ["LMDEPLOY_DISABLE_ASYNC"] = "1" # Configure for AWQ quantized model backend_config = TurbomindEngineConfig( model_format='awq', session_len=2048 # Explicitly set session length ) # Create a synchronous pipeline to avoid asyncio issues # Explicitly set all parameters that might default to async behavior internvl2_pipeline = pipeline( MODEL_ID, backend_config=backend_config, log_level='INFO', model_name_or_path=None, backend_name="turbomind", stream=False, # Important: disable streaming tensor_parallel=1, # Use single GPU to avoid distributed processing ) print("InternVL2 model loaded successfully!") MODEL_LOADED = True return True except Exception as e: print(f"Error loading InternVL2 model: {str(e)}") if "CUDA out of memory" in str(e): print("Not enough GPU memory for the model") elif "Found no NVIDIA driver" in str(e): print("NVIDIA GPU driver not found or not properly configured") MODEL_LOADED = False return False def analyze_image(image, prompt): """Analyze the image using InternVL2 model""" try: start_time = time.time() # Skip model loading if lmdeploy is not available if not LMDEPLOY_AVAILABLE: return ("This is a demo placeholder. The actual model couldn't be loaded because lmdeploy " "is not properly installed. Check your installation and dependencies.") # Check for GPU if not USE_GPU: return ("ERROR: This application requires a GPU to run InternVL2. " "The NVIDIA driver was not detected on this system. " "Please make sure this Space is using a GPU-enabled instance and that the GPU is correctly initialized.") # Make sure the model is loaded if not load_internvl2_model(): return "Couldn't load InternVL2 model. See logs for details." # Convert numpy array to PIL Image if isinstance(image, np.ndarray): image_pil = Image.fromarray(image).convert('RGB') else: # If somehow it's already a PIL Image image_pil = image.convert('RGB') # We'll use a completely different approach - multiprocessing # This runs the model in a separate process, avoiding any event loop conflicts import multiprocessing as mp # Define a function to run in a separate process def run_in_process(prompt, image_path, result_queue): try: # Set environment variables in the subprocess import os os.environ["CUDA_LAUNCH_BLOCKING"] = "1" os.environ["LMDEPLOY_DISABLE_ASYNC"] = "1" # Import libraries inside the process from lmdeploy import pipeline, TurbomindEngineConfig # Save the image to a temporary file to pass between processes import tempfile import torch # Check GPU in subprocess print(f"Subprocess GPU available: {torch.cuda.is_available()}") # Configure for AWQ quantized model backend_config = TurbomindEngineConfig( model_format='awq', session_len=2048 ) # Create new pipeline in the subprocess model_pipeline = pipeline( MODEL_ID, backend_config=backend_config, log_level='INFO', model_name_or_path=None, backend_name="turbomind", stream=False, tensor_parallel=1, ) # Load the image in the subprocess from PIL import Image image = Image.open(image_path).convert('RGB') # Run inference response = model_pipeline((prompt, image)) result = response.text if hasattr(response, "text") else str(response) # Put the result in the queue result_queue.put(("success", result)) except Exception as e: import traceback error_msg = f"Error in subprocess: {str(e)}\n{traceback.format_exc()}" result_queue.put(("error", error_msg)) # Create a temporary file for the image import tempfile with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file: temp_path = temp_file.name image_pil.save(temp_path) try: # Create a process-safe queue result_queue = mp.Queue() # Start the process print("Starting model inference in a separate process") process = mp.Process( target=run_in_process, args=(prompt, temp_path, result_queue) ) # Make it a daemon so it terminates when the main process ends process.daemon = True process.start() # Wait for the process to complete (with timeout) process.join(timeout=180) # 3 minute timeout # Delete the temporary file try: os.unlink(temp_path) except: pass if process.is_alive(): # Terminate the process if it's still running after timeout process.terminate() return "Model inference timed out after 180 seconds. The model might be too slow on this hardware." # Get the result from the queue (non-blocking to avoid hanging) if not result_queue.empty(): status, result = result_queue.get(block=False) if status == "error": return f"Error in model inference: {result}" else: elapsed_time = time.time() - start_time return result else: return "Unknown error: Model inference process completed but did not produce a result" except Exception as e: print(f"Error in multiprocessing: {str(e)}") return f"Error setting up multiprocessing: {str(e)}" except Exception as e: print(f"Error in image analysis: {str(e)}") # Try to clean up memory in case of error if USE_GPU: torch.cuda.empty_cache() return f"Error in image analysis: {str(e)}" def process_image(image, analysis_type="general"): """Process the image and return the analysis""" if image is None: return "Please upload an image." # Define prompt based on analysis type if analysis_type == "general": prompt = "Describe this image in detail." elif analysis_type == "text": prompt = "What text can you see in this image? Please transcribe it accurately." elif analysis_type == "chart": prompt = "Analyze any charts, graphs or diagrams in this image in detail, including trends, data points, and conclusions." elif analysis_type == "people": prompt = "Describe the people in this image - their appearance, actions, and expressions." elif analysis_type == "technical": prompt = "Provide a technical analysis of this image, including object identification, spatial relationships, and any technical elements present." else: prompt = "Describe this image in detail." start_time = time.time() # Get analysis from the model analysis = analyze_image(image, prompt) elapsed_time = time.time() - start_time return f"{analysis}\n\nAnalysis completed in {elapsed_time:.2f} seconds." # Define the Gradio interface def create_interface(): with gr.Blocks(title="Image Analysis with InternVL2") as demo: gr.Markdown("# Image Analysis with InternVL2-40B") # System diagnostics system_info = f""" ## System Diagnostics: - PyTorch Version: {torch.__version__} - CUDA Available: {torch.cuda.is_available()} - GPU Working: {USE_GPU} - nvidia-smi Available: {nvidia_smi_available} """ gr.Markdown(system_info) gr.Markdown("Upload an image to analyze it using the InternVL2-40B model.") # Show warnings based on system status if not LMDEPLOY_AVAILABLE: gr.Markdown("⚠️ **WARNING**: lmdeploy is not properly installed. This demo will not function correctly.", elem_classes=["warning-message"]) if not USE_GPU: gr.Markdown("🚫 **ERROR**: NVIDIA GPU not detected. This application requires GPU acceleration to run InternVL2 model.", elem_classes=["error-message"]) with gr.Row(): with gr.Column(scale=1): input_image = gr.Image(type="pil", label="Upload Image") analysis_type = gr.Radio( ["general", "text", "chart", "people", "technical"], label="Analysis Type", value="general" ) submit_btn = gr.Button("Analyze Image") # Disable button if GPU is not available if not USE_GPU: submit_btn.interactive = False with gr.Column(scale=2): output_text = gr.Textbox(label="Analysis Result", lines=20) if not USE_GPU: output_text.value = f"""ERROR: NVIDIA GPU driver not detected. This application requires GPU acceleration to run the InternVL2 model. Diagnostics: - PyTorch Version: {torch.__version__} - CUDA Available via PyTorch: {torch.cuda.is_available()} - nvidia-smi Available: {nvidia_smi_available} - GPU Working: {USE_GPU} Please ensure this Space is using a GPU-enabled instance and that the GPU is correctly initialized.""" submit_btn.click( fn=process_image, inputs=[input_image, analysis_type], outputs=output_text ) gr.Markdown(""" ## Analysis Types - **General**: General description of the image - **Text**: Focus on identifying and transcribing text in the image - **Chart**: Detailed analysis of charts, graphs, and diagrams - **People**: Description of people, their appearance and actions - **Technical**: Technical analysis identifying objects and spatial relationships """) # Hardware requirements notice gr.Markdown(""" ## System Requirements This application requires: - NVIDIA GPU with CUDA support - At least 16GB of GPU memory recommended - GPU drivers properly installed and configured If you're running this on Hugging Face Spaces, make sure to select a GPU-enabled hardware type. """) # Examples try: gr.Examples( examples=[ ["data_temp/page_2.png", "general"], ["data_temp/page_2.png", "text"], ["data_temp/page_2.png", "chart"] ], inputs=[input_image, analysis_type], outputs=output_text, fn=process_image, cache_examples=True ) except Exception as e: print(f"Warning: Could not load examples: {str(e)}") return demo # Main function if __name__ == "__main__": # Create the Gradio interface demo = create_interface() # Launch the interface (removed incompatible parameters) demo.launch(share=False, server_name="0.0.0.0")