import gradio as gr
from PIL import Image
import os
import time
import numpy as np
import torch
import warnings
import stat
import subprocess
import sys
import asyncio
import nest_asyncio

# Apply nest_asyncio to allow nested event loops
nest_asyncio.apply()

# Set environment variables
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

# Print system information
print(f"Python version: {sys.version}")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available via PyTorch: {torch.cuda.is_available()}")
print(f"CUDA version via PyTorch: {torch.version.cuda if torch.cuda.is_available() else 'Not available'}")

# Try to run nvidia-smi
def run_nvidia_smi():
    try:
        result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        if result.returncode == 0:
            print("nvidia-smi output:")
            print(result.stdout)
            return True
        else:
            print("nvidia-smi error:")
            print(result.stderr)
            return False
    except Exception as e:
        print(f"Error running nvidia-smi: {str(e)}")
        return False

# Run nvidia-smi
nvidia_smi_available = run_nvidia_smi()
print(f"nvidia-smi available: {nvidia_smi_available}")

# Show CUDA devices
if torch.cuda.is_available():
    print(f"CUDA device count: {torch.cuda.device_count()}")
    for i in range(torch.cuda.device_count()):
        print(f"CUDA Device {i}: {torch.cuda.get_device_name(i)}")
    print(f"Current CUDA device: {torch.cuda.current_device()}")

# Ensure all cache directories exist with proper permissions
def setup_cache_directories():
    # Gradio cache directory
    cache_dir = os.path.join(os.getcwd(), "gradio_cached_examples")
    os.makedirs(cache_dir, exist_ok=True)
    
    # HuggingFace cache directories
    hf_cache = os.path.join(os.getcwd(), ".cache", "huggingface")
    transformers_cache = os.path.join(hf_cache, "transformers")
    os.makedirs(hf_cache, exist_ok=True)
    os.makedirs(transformers_cache, exist_ok=True)
    
    # Set permissions
    try:
        for directory in [cache_dir, hf_cache, transformers_cache]:
            if os.path.exists(directory):
                os.chmod(directory, stat.S_IRWXU | stat.S_IRWXG | stat.S_IRWXO)  # 0o777
                print(f"Set permissions for {directory}")
    except Exception as e:
        print(f"Warning: Could not set permissions: {str(e)}")
    
    return cache_dir

# Set up cache directories
cache_dir = setup_cache_directories()

# Suppress specific warnings that might be caused by package version mismatches
warnings.filterwarnings("ignore", message=".*The 'nopython' keyword.*")
warnings.filterwarnings("ignore", message=".*Torch is not compiled with CUDA enabled.*")
warnings.filterwarnings("ignore", category=UserWarning)

# Check for actual GPU availability
def check_gpu_availability():
    """Check if GPU is actually available and working"""
    print("Checking GPU availability...")
    
    if not torch.cuda.is_available():
        print("CUDA is not available in PyTorch")
        return False
    
    try:
        # Try to initialize CUDA and run a simple operation
        print("Attempting to create a tensor on CUDA...")
        x = torch.rand(10, device="cuda")
        y = x + x
        print("Successfully created and operated on CUDA tensor")
        return True
    except Exception as e:
        print(f"GPU initialization failed: {str(e)}")
        return False

# Global variables
internvl2_pipeline = None
MODEL_LOADED = False
USE_GPU = check_gpu_availability()

if USE_GPU:
    print("GPU is available and working properly")
else:
    print("WARNING: GPU is not available or not working properly. This application requires GPU acceleration.")

# Check if lmdeploy is available and try to import
try:
    from lmdeploy import pipeline, TurbomindEngineConfig
    LMDEPLOY_AVAILABLE = True
    print("Successfully imported lmdeploy")
except ImportError as e:
    LMDEPLOY_AVAILABLE = False
    print(f"lmdeploy import failed: {str(e)}. Will use a placeholder for demos.")

# Model configuration
MODEL_ID = "OpenGVLab/InternVL2-40B-AWQ"  # 4-bit quantized model

def load_internvl2_model():
    """Load the InternVL2 model using lmdeploy"""
    global internvl2_pipeline, MODEL_LOADED
    
    # If already loaded, return
    if internvl2_pipeline is not None:
        return True
    
    # If lmdeploy is not available, we'll use a demo placeholder
    if not LMDEPLOY_AVAILABLE:
        print("lmdeploy not available. Using demo placeholder.")
        MODEL_LOADED = False
        return False

    # Check if GPU is available
    if not USE_GPU:
        print("Cannot load InternVL2 model without GPU acceleration.")
        MODEL_LOADED = False
        return False
        
    print("Loading InternVL2 model...")
    try:
        # Force synchronous execution for everything
        import os
        # Set environment variables to force synchronous behavior
        os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
        # Disable asyncio in lmdeploy
        os.environ["LMDEPLOY_DISABLE_ASYNC"] = "1"
        
        # Configure for AWQ quantized model
        backend_config = TurbomindEngineConfig(
            model_format='awq',
            session_len=2048  # Explicitly set session length
        )
        
        # Create a synchronous pipeline to avoid asyncio issues
        # Explicitly set all parameters that might default to async behavior
        internvl2_pipeline = pipeline(
            MODEL_ID, 
            backend_config=backend_config,
            log_level='INFO',
            model_name_or_path=None,
            backend_name="turbomind",
            stream=False,  # Important: disable streaming
            tensor_parallel=1,  # Use single GPU to avoid distributed processing
        )
        
        print("InternVL2 model loaded successfully!")
        MODEL_LOADED = True
        return True
    except Exception as e:
        print(f"Error loading InternVL2 model: {str(e)}")
        if "CUDA out of memory" in str(e):
            print("Not enough GPU memory for the model")
        elif "Found no NVIDIA driver" in str(e):
            print("NVIDIA GPU driver not found or not properly configured")
        MODEL_LOADED = False
        return False

def analyze_image(image, prompt):
    """Analyze the image using InternVL2 model"""
    try:
        start_time = time.time()
        
        # Skip model loading if lmdeploy is not available
        if not LMDEPLOY_AVAILABLE:
            return ("This is a demo placeholder. The actual model couldn't be loaded because lmdeploy " 
                   "is not properly installed. Check your installation and dependencies.")
        
        # Check for GPU
        if not USE_GPU:
            return ("ERROR: This application requires a GPU to run InternVL2. "
                  "The NVIDIA driver was not detected on this system. "
                  "Please make sure this Space is using a GPU-enabled instance and that the GPU is correctly initialized.")
        
        # Make sure the model is loaded
        if not load_internvl2_model():
            return "Couldn't load InternVL2 model. See logs for details."
            
        # Convert numpy array to PIL Image
        if isinstance(image, np.ndarray):
            image_pil = Image.fromarray(image).convert('RGB')
        else:
            # If somehow it's already a PIL Image
            image_pil = image.convert('RGB')

        # We'll use a completely different approach - multiprocessing
        # This runs the model in a separate process, avoiding any event loop conflicts
        import multiprocessing as mp
        
        # Define a function to run in a separate process
        def run_in_process(prompt, image_path, result_queue):
            try:
                # Set environment variables in the subprocess
                import os
                os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
                os.environ["LMDEPLOY_DISABLE_ASYNC"] = "1"
                
                # Import libraries inside the process
                from lmdeploy import pipeline, TurbomindEngineConfig
                
                # Save the image to a temporary file to pass between processes
                import tempfile
                import torch
                
                # Check GPU in subprocess
                print(f"Subprocess GPU available: {torch.cuda.is_available()}")
                
                # Configure for AWQ quantized model
                backend_config = TurbomindEngineConfig(
                    model_format='awq',
                    session_len=2048
                )
                
                # Create new pipeline in the subprocess
                model_pipeline = pipeline(
                    MODEL_ID, 
                    backend_config=backend_config,
                    log_level='INFO',
                    model_name_or_path=None,
                    backend_name="turbomind",
                    stream=False,
                    tensor_parallel=1,
                )
                
                # Load the image in the subprocess
                from PIL import Image
                image = Image.open(image_path).convert('RGB')
                
                # Run inference
                response = model_pipeline((prompt, image))
                result = response.text if hasattr(response, "text") else str(response)
                
                # Put the result in the queue
                result_queue.put(("success", result))
                
            except Exception as e:
                import traceback
                error_msg = f"Error in subprocess: {str(e)}\n{traceback.format_exc()}"
                result_queue.put(("error", error_msg))
        
        # Create a temporary file for the image
        import tempfile
        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
            temp_path = temp_file.name
            image_pil.save(temp_path)
        
        try:
            # Create a process-safe queue
            result_queue = mp.Queue()
            
            # Start the process
            print("Starting model inference in a separate process")
            process = mp.Process(
                target=run_in_process, 
                args=(prompt, temp_path, result_queue)
            )
            
            # Make it a daemon so it terminates when the main process ends
            process.daemon = True
            process.start()
            
            # Wait for the process to complete (with timeout)
            process.join(timeout=180)  # 3 minute timeout
            
            # Delete the temporary file
            try:
                os.unlink(temp_path)
            except:
                pass
                
            if process.is_alive():
                # Terminate the process if it's still running after timeout
                process.terminate()
                return "Model inference timed out after 180 seconds. The model might be too slow on this hardware."
            
            # Get the result from the queue (non-blocking to avoid hanging)
            if not result_queue.empty():
                status, result = result_queue.get(block=False)
                if status == "error":
                    return f"Error in model inference: {result}"
                else:
                    elapsed_time = time.time() - start_time
                    return result
            else:
                return "Unknown error: Model inference process completed but did not produce a result"
            
        except Exception as e:
            print(f"Error in multiprocessing: {str(e)}")
            return f"Error setting up multiprocessing: {str(e)}"
            
    except Exception as e:
        print(f"Error in image analysis: {str(e)}")
        # Try to clean up memory in case of error
        if USE_GPU:
            torch.cuda.empty_cache()
        return f"Error in image analysis: {str(e)}"

def process_image(image, analysis_type="general"):
    """Process the image and return the analysis"""
    if image is None:
        return "Please upload an image."
    
    # Define prompt based on analysis type
    if analysis_type == "general":
        prompt = "Describe this image in detail."
    elif analysis_type == "text":
        prompt = "What text can you see in this image? Please transcribe it accurately."
    elif analysis_type == "chart":
        prompt = "Analyze any charts, graphs or diagrams in this image in detail, including trends, data points, and conclusions."
    elif analysis_type == "people":
        prompt = "Describe the people in this image - their appearance, actions, and expressions."
    elif analysis_type == "technical":
        prompt = "Provide a technical analysis of this image, including object identification, spatial relationships, and any technical elements present."
    else:
        prompt = "Describe this image in detail."
    
    start_time = time.time()
    
    # Get analysis from the model
    analysis = analyze_image(image, prompt)
    
    elapsed_time = time.time() - start_time
    return f"{analysis}\n\nAnalysis completed in {elapsed_time:.2f} seconds."

# Define the Gradio interface
def create_interface():
    with gr.Blocks(title="Image Analysis with InternVL2") as demo:
        gr.Markdown("# Image Analysis with InternVL2-40B")
        
        # System diagnostics
        system_info = f"""
        ## System Diagnostics:
        - PyTorch Version: {torch.__version__}
        - CUDA Available: {torch.cuda.is_available()}
        - GPU Working: {USE_GPU}
        - nvidia-smi Available: {nvidia_smi_available}
        """
        
        gr.Markdown(system_info)
        gr.Markdown("Upload an image to analyze it using the InternVL2-40B model.")
        
        # Show warnings based on system status
        if not LMDEPLOY_AVAILABLE:
            gr.Markdown("⚠️ **WARNING**: lmdeploy is not properly installed. This demo will not function correctly.", elem_classes=["warning-message"])
        
        if not USE_GPU:
            gr.Markdown("🚫 **ERROR**: NVIDIA GPU not detected. This application requires GPU acceleration to run InternVL2 model.", elem_classes=["error-message"])
        
        with gr.Row():
            with gr.Column(scale=1):
                input_image = gr.Image(type="pil", label="Upload Image")
                analysis_type = gr.Radio(
                    ["general", "text", "chart", "people", "technical"],
                    label="Analysis Type",
                    value="general"
                )
                submit_btn = gr.Button("Analyze Image")
                
                # Disable button if GPU is not available
                if not USE_GPU:
                    submit_btn.interactive = False
            
            with gr.Column(scale=2):
                output_text = gr.Textbox(label="Analysis Result", lines=20)
                if not USE_GPU:
                    output_text.value = f"""ERROR: NVIDIA GPU driver not detected. This application requires GPU acceleration to run the InternVL2 model.

Diagnostics:
- PyTorch Version: {torch.__version__}
- CUDA Available via PyTorch: {torch.cuda.is_available()}
- nvidia-smi Available: {nvidia_smi_available}
- GPU Working: {USE_GPU}

Please ensure this Space is using a GPU-enabled instance and that the GPU is correctly initialized."""
        
        submit_btn.click(
            fn=process_image,
            inputs=[input_image, analysis_type],
            outputs=output_text
        )
        
        gr.Markdown("""
        ## Analysis Types
        - **General**: General description of the image
        - **Text**: Focus on identifying and transcribing text in the image
        - **Chart**: Detailed analysis of charts, graphs, and diagrams
        - **People**: Description of people, their appearance and actions
        - **Technical**: Technical analysis identifying objects and spatial relationships
        """)
        
        # Hardware requirements notice
        gr.Markdown("""
        ## System Requirements
        This application requires:
        - NVIDIA GPU with CUDA support
        - At least 16GB of GPU memory recommended
        - GPU drivers properly installed and configured
        
        If you're running this on Hugging Face Spaces, make sure to select a GPU-enabled hardware type.
        """)
        
        # Examples
        try:
            gr.Examples(
                examples=[
                    ["data_temp/page_2.png", "general"],
                    ["data_temp/page_2.png", "text"],
                    ["data_temp/page_2.png", "chart"]
                ],
                inputs=[input_image, analysis_type],
                outputs=output_text,
                fn=process_image,
                cache_examples=True
            )
        except Exception as e:
            print(f"Warning: Could not load examples: {str(e)}")
    
    return demo

# Main function
if __name__ == "__main__":
    # Create the Gradio interface
    demo = create_interface()
    
    # Launch the interface (removed incompatible parameters)
    demo.launch(share=False, server_name="0.0.0.0")