Spaces:

mknolan
/

internvl2-chat-analyzer

Paused

App Files Files Community

mknolan commited on Mar 16

Commit

6131f9b

verified ·

1 Parent(s): f952993

Fix example images URLs to prevent 404 errors

Browse files

Files changed (1) hide show

app.py +111 -84

app.py CHANGED Viewed

@@ -6,11 +6,9 @@ from PIL import Image
 import traceback
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from transformers.generation import GenerationConfig
-import importlib.util
-import importlib.machinery
 print("=" * 50)
-print("InternVL2-8B IMAGE & TEXT ANALYSIS")
 print("=" * 50)
 # System information
@@ -31,88 +29,86 @@ if torch.cuda.is_available():
 else:
     print("CUDA is not available. This application requires GPU acceleration.")
-# Create a proper flash_attn mock module before loading the model
 def setup_flash_attn_mock():
-    # Create a more complete mock for flash_attn
-    print("Setting up a proper flash_attn mock...")
-    # First, remove any existing flash_attn module if it exists
     if "flash_attn" in sys.modules:
-        del sys.modules["flash_attn"]
-    # Create a simple Python file with flash_attn mock code
-    flash_attn_path = os.path.join(os.getcwd(), "flash_attn.py")
-    with open(flash_attn_path, "w") as f:
-        f.write("""
-# Mock flash_attn module
-__version__ = "0.0.0-disabled"
-def flash_attn_func(*args, **kwargs):
-    raise NotImplementedError("This is a mock flash_attn implementation")
-def flash_attn_kvpacked_func(*args, **kwargs):
-    raise NotImplementedError("This is a mock flash_attn implementation")
-def flash_attn_qkvpacked_func(*args, **kwargs):
-    raise NotImplementedError("This is a mock flash_attn implementation")
-# Add any other functions that might be needed
-""")
-    # Load the mock module properly with spec
-    spec = importlib.util.spec_from_file_location("flash_attn", flash_attn_path)
-    flash_attn_module = importlib.util.module_from_spec(spec)
-    sys.modules["flash_attn"] = flash_attn_module
-    spec.loader.exec_module(flash_attn_module)
-    # Now also create the flash_attn_2_cuda if needed
-    if "flash_attn_2_cuda" not in sys.modules:
-        flash_attn_2_path = os.path.join(os.getcwd(), "flash_attn_2_cuda.py")
-        with open(flash_attn_2_path, "w") as f:
-            f.write("# Mock flash_attn_2_cuda module\n")
-        spec_cuda = importlib.util.spec_from_file_location("flash_attn_2_cuda", flash_attn_2_path)
-        flash_attn_2_cuda_module = importlib.util.module_from_spec(spec_cuda)
-        sys.modules["flash_attn_2_cuda"] = flash_attn_2_cuda_module
-        spec_cuda.loader.exec_module(flash_attn_2_cuda_module)
-    print("Flash-attention mock modules set up successfully")
 # Create a function to load the model
 def load_model():
     try:
-        print("\nLoading InternVL2-8B model...")
-        # Set up proper mock modules for flash_attn
         setup_flash_attn_mock()
-        # Disable flash attention in transformers by patching environment vars
-        os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "1"
-        os.environ["TRANSFORMERS_OFFLINE"] = "1"  # Avoid online checks for flash_attn
         # Load the model and tokenizer
         model_path = "OpenGVLab/InternVL2-8B"
-        print("Loading tokenizer...")
-        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-        print("Loading model (this may take a while)...")
-        # Add specific flags to avoid flash_attn usage
         model = AutoModelForCausalLM.from_pretrained(
             model_path,
-            torch_dtype=torch.bfloat16,
-            device_map="auto",
-            trust_remote_code=True,
-            use_flash_attention_2=False,  # Explicitly disable flash attention
-            attn_implementation="eager"    # Use eager implementation instead
         )
-        # Define generation config
-        generation_config = GenerationConfig(
-            max_new_tokens=512,
-            do_sample=True,
-            temperature=0.7,
-            top_p=0.8,
-            repetition_penalty=1.0
         )
         print("✓ Model and tokenizer loaded successfully!")
@@ -127,24 +123,55 @@ def load_model():
 def load_image(image_path, processor=None):
     """Load an image and prepare it for the model."""
     if isinstance(image_path, str):
-        image = Image.open(image_path).convert('RGB')
     else:
         image = image_path
-    # The model handles image processing internally
     return image
 # Function to analyze an image with text
 def analyze_image(model, tokenizer, image, prompt, generation_config):
     try:
-        # Process the conversation
-        messages = [
-            {"role": "user", "content": f"{prompt}", "image": image}
-        ]
         # Generate a response
-        response = model.chat(tokenizer, messages=messages, generation_config=generation_config)
-        return response
     except Exception as e:
         error_msg = f"Error analyzing image: {str(e)}"
@@ -158,7 +185,7 @@ def create_interface():
     if model is None:
         # If model loading failed, create a simple error interface
-        with gr.Blocks(title="InternVL2 Chat - Error") as demo:
             gr.Markdown("# ❌ Error: Failed to load models")
             gr.Markdown("Please check the console for error details.")
         return demo
@@ -166,18 +193,18 @@ def create_interface():
     # Predefined prompts for analysis
     prompts = [
         "Describe this image in detail.",
-        "What text appears in this image? Please read and transcribe it accurately.",
-        "Analyze the content of this image, including any text, pictures, and their relationships.",
         "What is the main subject of this image?",
-        "Is there any text in this image? If so, what does it say?",
-        "Describe the layout and visual elements of this document.",
-        "Summarize the key information presented in this image."
     ]
     # Create the full interface
     with gr.Blocks(title="InternVL2 Image Analysis") as demo:
-        gr.Markdown("# 🖼️ InternVL2-8B Image & Text Analyzer")
-        gr.Markdown("### Upload an image to analyze its visual content and text")
         with gr.Row():
             with gr.Column(scale=1):
@@ -194,11 +221,11 @@ def create_interface():
             with gr.Column(scale=1):
                 output = gr.Textbox(label="Analysis Results", lines=15)
-        # Example images
         gr.Examples(
             examples=[
-                ["https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/blip-image-demo.png", "What's in this image?"],
-                ["https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/assets/130_vision_language_pretraining/fig_vision_language.jpg", "Describe this diagram in detail."],
             ],
             inputs=[input_image, custom_prompt],
         )
@@ -232,4 +259,4 @@ if __name__ == "__main__":
     # Create and launch the interface
     demo = create_interface()
-    demo.launch(share=False, server_name="0.0.0.0")

 import traceback
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from transformers.generation import GenerationConfig
 print("=" * 50)
+print("InternVL2 IMAGE & TEXT ANALYSIS")
 print("=" * 50)
 # System information
 else:
     print("CUDA is not available. This application requires GPU acceleration.")
+# Create a mock function for flash_attn modules
 def setup_flash_attn_mock():
+    # Disable flash attention in transformers
+    os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "1"
+    # First, check if flash_attn is already imported
     if "flash_attn" in sys.modules:
+        print("flash_attn module already imported - no mocking needed")
+        return
+    # If we should mock the module
+    print("Setting up flash_attn mock...")
+    # Create a proper mock that has the necessary attributes
+    class FlashAttnMock:
+        __version__ = "0.0.0-disabled-mock"
+        def __init__(self):
+            pass
+        def flash_attn_func(self, *args, **kwargs):
+            raise NotImplementedError("This is a mock flash_attn implementation")
+        def flash_attn_kvpacked_func(self, *args, **kwargs):
+            raise NotImplementedError("This is a mock flash_attn implementation")
+        def flash_attn_qkvpacked_func(self, *args, **kwargs):
+            raise NotImplementedError("This is a mock flash_attn implementation")
+    # Create the module with proper spec
+    import types
+    flash_attn_mock = FlashAttnMock()
+    sys.modules["flash_attn"] = flash_attn_mock
+    print("flash_attn mock set up successfully")
+    # Also mock the related modules that might be imported
+    sys.modules["flash_attn.flash_attn_interface"] = types.ModuleType("flash_attn.flash_attn_interface")
+    sys.modules["flash_attn.flash_attn_triton"] = types.ModuleType("flash_attn.flash_attn_triton")
+    # Check if it worked
+    try:
+        import flash_attn
+        print(f"Mock flash_attn module version: {flash_attn.__version__}")
+    except:
+        print("Warning: flash_attn mock failed to load correctly")
 # Create a function to load the model
 def load_model():
     try:
+        print("\nLoading InternVL2 model...")
+        # Setup flash_attn mock
         setup_flash_attn_mock()
         # Load the model and tokenizer
         model_path = "OpenGVLab/InternVL2-8B"
+        # Print downloading status
+        print("Downloading model shards. This may take some time...")
+        # Load the model
         model = AutoModelForCausalLM.from_pretrained(
             model_path,
+            torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
+            low_cpu_mem_usage=True,
+            device_map="auto" if torch.cuda.is_available() else None,
+            trust_remote_code=True
         )
+        # Load tokenizer
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path,
+            use_fast=False,
+            trust_remote_code=True
+        )
+        # Set generation config
+        generation_config = GenerationConfig.from_pretrained(
+            model_path,
+            trust_remote_code=True
         )
         print("✓ Model and tokenizer loaded successfully!")
 def load_image(image_path, processor=None):
     """Load an image and prepare it for the model."""
     if isinstance(image_path, str):
+        if image_path.startswith('http'):
+            import requests
+            from io import BytesIO
+            try:
+                response = requests.get(image_path, timeout=10)
+                image = Image.open(BytesIO(response.content)).convert('RGB')
+            except Exception as e:
+                print(f"Error loading image from URL: {e}")
+                # Return a default image or raise an error
+                image = Image.new('RGB', (224, 224), color='gray')
+        else:
+            image = Image.open(image_path).convert('RGB')
     else:
         image = image_path
+    # No need to process, the model handles that internally
     return image
 # Function to analyze an image with text
 def analyze_image(model, tokenizer, image, prompt, generation_config):
     try:
+        # Prepare inputs
+        text_prompt = f"USER: <image>\n{prompt}\nASSISTANT:"
+        # Convert inputs for the model
+        inputs = tokenizer([text_prompt], return_tensors="pt")
+        # Move inputs to the right device
+        if torch.cuda.is_available():
+            inputs = {k: v.to("cuda") for k, v in inputs.items()}
+        # Add image to the inputs
+        inputs["images"] = [image]
         # Generate a response
+        with torch.no_grad():
+            outputs = model.generate(
+                **inputs,
+                generation_config=generation_config,
+                max_new_tokens=512,
+            )
+        # Decode the outputs
+        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        # Extract only the assistant's response
+        assistant_response = generated_text.split("ASSISTANT:")[-1].strip()
+        return assistant_response
     except Exception as e:
         error_msg = f"Error analyzing image: {str(e)}"
     if model is None:
         # If model loading failed, create a simple error interface
+        with gr.Blocks(title="InternVL2 Image Analysis - Error") as demo:
             gr.Markdown("# ❌ Error: Failed to load models")
             gr.Markdown("Please check the console for error details.")
         return demo
     # Predefined prompts for analysis
     prompts = [
         "Describe this image in detail.",
+        "What can you tell me about this image?",
+        "Is there any text in this image? If so, can you read it?",
         "What is the main subject of this image?",
+        "What emotions or feelings does this image convey?",
+        "Describe the composition and visual elements of this image.",
+        "Summarize what you see in this image in one paragraph."
     ]
     # Create the full interface
     with gr.Blocks(title="InternVL2 Image Analysis") as demo:
+        gr.Markdown("# 🖼️ InternVL2 Image & Text Analyzer")
+        gr.Markdown("### Upload an image and ask questions about it")
         with gr.Row():
             with gr.Column(scale=1):
             with gr.Column(scale=1):
                 output = gr.Textbox(label="Analysis Results", lines=15)
+        # Example images - UPDATED with more reliable image URLs
         gr.Examples(
             examples=[
+                ["https://github.com/huggingface/transformers/raw/main/docs/source/en/model_doc/blip-2_files/BobRoss.jpg", "What's in this image?"],
+                ["https://raw.githubusercontent.com/openai/CLIP/main/CLIP.png", "Describe this diagram in detail."],
             ],
             inputs=[input_image, custom_prompt],
         )
     # Create and launch the interface
     demo = create_interface()
+    demo.launch(share=False, server_name="0.0.0.0")