Implement pure in-memory flash_attn mock to fix __spec__ error
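The root cause: transformers probes for flash-attn through importlib, and the previous mock left an object without a usable __spec__ in sys.modules, so importlib.util.find_spec("flash_attn") raised ValueError: flash_attn.__spec__ is not set. Below is a minimal sketch of that failure mode; the old mock survives only in fragments in this diff, so the exact shape of the registered object is an assumption, but the find_spec behaviour is standard-library behaviour.

import sys
import importlib.util

class FakeFlashAttn:
    # function stubs, but none of the import-system metadata a real module carries
    def flash_attn_func(self, *args, **kwargs):
        raise NotImplementedError("mock")

sys.modules["flash_attn"] = FakeFlashAttn()

importlib.util.find_spec("flash_attn")
# ValueError: flash_attn.__spec__ is not set

This commit instead builds every mocked module with importlib.util.module_from_spec behind a MetaPathFinder/Loader pair, so the spec, loader, and package metadata are always present.
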
app.py
CHANGED
@@ -4,8 +4,10 @@ import sys
 import gradio as gr
 from PIL import Image
 import traceback
-
-
+import types
+import importlib.util
+import importlib.machinery
+import importlib.abc

 print("=" * 50)
 print("InternVL2 IMAGE & TEXT ANALYSIS")

@@ -29,74 +31,180 @@ if torch.cuda.is_available():
 else:
     print("CUDA is not available. This application requires GPU acceleration.")

-    def flash_attn_func(self, *args, **kwargs):
-        raise NotImplementedError("This is a mock flash_attn implementation")
+# In-memory mock implementation
+def create_in_memory_flash_attn_mock():
+    """Create a completely in-memory flash_attn mock with all required attributes"""
+    print("Setting up in-memory flash_attn mock...")
+
+    # Create a dummy module finder and loader for the mock
+    class DummyFinder(importlib.abc.MetaPathFinder):
+        def find_spec(self, fullname, path, target=None):
+            if fullname == 'flash_attn' or fullname.startswith('flash_attn.'):
+                return self.create_spec(fullname)
+            elif fullname == 'flash_attn_2_cuda':
+                return self.create_spec(fullname)
+            return None
+
+        def create_spec(self, fullname):
+            # Create a spec
+            loader = DummyLoader(fullname)
+            spec = importlib.machinery.ModuleSpec(
+                name=fullname,
+                loader=loader,
+                is_package=fullname.count('.') == 0 or fullname.split('.')[-1] == ''
+            )
+            return spec
+
+    class DummyLoader(importlib.abc.Loader):
+        def __init__(self, fullname):
+            self.fullname = fullname
+
+        def create_module(self, spec):
+            module = types.ModuleType(spec.name)
+
+            # Set default attributes for any module
+            module.__spec__ = spec
+            module.__loader__ = self
+            module.__file__ = f"<{spec.name}>"
+            module.__path__ = []
+            module.__package__ = spec.name.rpartition('.')[0] if '.' in spec.name else ''
+
+            if spec.name == 'flash_attn':
+                # Add flash_attn-specific attributes
+                module.__version__ = "0.0.0-mocked"
+
+                # Add flash_attn functions
+                module.flash_attn_func = lambda *args, **kwargs: None
+                module.flash_attn_kvpacked_func = lambda *args, **kwargs: None
+                module.flash_attn_qkvpacked_func = lambda *args, **kwargs: None
+
+            return module
+
+        def exec_module(self, module):
+            # Nothing to execute
+            pass
+
+    # Remove any existing modules to avoid conflicts
+    for name in list(sys.modules.keys()):
+        if name == 'flash_attn' or name.startswith('flash_attn.') or name == 'flash_attn_2_cuda':
+            del sys.modules[name]
+
+    # Register our finder at the beginning of meta_path
+    sys.meta_path.insert(0, DummyFinder())
+
+    # Pre-create and configure the flash_attn module
+    spec = importlib.machinery.ModuleSpec(
+        name='flash_attn',
+        loader=DummyLoader('flash_attn'),
+        is_package=True
+    )
+    flash_attn = importlib.util.module_from_spec(spec)
+    sys.modules['flash_attn'] = flash_attn
+
+    # Add attributes used by transformers checks
+    flash_attn.__version__ = "0.0.0-mocked"
+
+    # Create common submodules
+    for submodule in ['flash_attn.flash_attn_interface', 'flash_attn.flash_attn_triton']:
+        parts = submodule.split('.')
+        parent_name = '.'.join(parts[:-1])
+        child_name = parts[-1]
+        parent = sys.modules[parent_name]
+
+        # Create submodule spec (ModuleSpec takes no parent argument; the parent
+        # relationship is derived from the dotted name)
+        subspec = importlib.machinery.ModuleSpec(
+            name=submodule,
+            loader=DummyLoader(submodule),
+            is_package=False
+        )
+
+        # Create and register submodule
+        module = importlib.util.module_from_spec(subspec)
+        setattr(parent, child_name, module)
+        sys.modules[submodule] = module
+
+    # Create flash_attn_2_cuda module
+    cuda_spec = importlib.machinery.ModuleSpec(
+        name='flash_attn_2_cuda',
+        loader=DummyLoader('flash_attn_2_cuda'),
+        is_package=False
+    )
+    cuda_module = importlib.util.module_from_spec(cuda_spec)
+    sys.modules['flash_attn_2_cuda'] = cuda_module
+
+    # Set environment variables to disable flash attention
+    os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "1"
+    os.environ["TRANSFORMERS_OFFLINE"] = "1"  # Avoid online checks
+
+    # Verify the mock was created successfully
     try:
         import flash_attn
-        print(f"Mock flash_attn
+        print(f"✓ Mock flash_attn loaded successfully: {flash_attn.__version__}")
+        print(f"✓ flash_attn.__spec__ exists: {flash_attn.__spec__ is not None}")
+
+        # Explicitly check that importlib.util.find_spec can see the mock
+        spec = importlib.util.find_spec("flash_attn")
+        print(f"✓ importlib.util.find_spec returns: {spec is not None}")
+
+        # Check that parent/child relationships work
+        import flash_attn.flash_attn_interface
+        print("✓ flash_attn.flash_attn_interface loaded")
+
+        # Check CUDA module
+        import flash_attn_2_cuda
+        print("✓ flash_attn_2_cuda loaded")
+    except Exception as e:
+        print(f"WARNING: Error verifying flash_attn mock: {e}")
+        traceback.print_exc()
+
+# Now set up the mock BEFORE importing transformers
+create_in_memory_flash_attn_mock()
+
+# Import transformers AFTER setting up mock
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers.generation import GenerationConfig

 # Create a function to load the model
 def load_model():
     try:
         print("\nLoading InternVL2 model...")

-        # Setup flash_attn mock
-        setup_flash_attn_mock()
-
         # Load the model and tokenizer
         model_path = "OpenGVLab/InternVL2-8B"

         # Print downloading status
         print("Downloading model shards. This may take some time...")

-        # Load the model
+        # Load the model - with careful error handling
+        try:
+            model = AutoModelForCausalLM.from_pretrained(
+                model_path,
+                torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
+                low_cpu_mem_usage=True,
+                device_map="auto" if torch.cuda.is_available() else None,
+                trust_remote_code=True
+            )
+        except Exception as e:
+            # If we get the flash_attn error, print detailed information
+            if "flash_attn.__spec__ is not set" in str(e):
+                print("\n❌ Flash attention error detected!")
+
+                # See if our mock is still in place
+                if 'flash_attn' in sys.modules:
+                    mock = sys.modules['flash_attn']
+                    print(f"Flash mock exists: {mock}")
+                    print(f"Flash mock __spec__: {getattr(mock, '__spec__', 'NOT SET')}")
+                else:
+                    print("flash_attn module was removed from sys.modules")
+
+                # Diagnostic info
+                print("\nCurrent state of sys.meta_path:")
+                for i, finder in enumerate(sys.meta_path):
+                    print(f"  {i}: {finder.__class__.__name__}")
+
+            # Re-raise the exception
+            raise

         # Load tokenizer
         tokenizer = AutoTokenizer.from_pretrained(

@@ -221,11 +329,11 @@ def create_interface():
         with gr.Column(scale=1):
             output = gr.Textbox(label="Analysis Results", lines=15)

-        # Example images -
+        # Example images - Using stable URLs from GitHub repositories
         gr.Examples(
             examples=[
-                ["https://
-                ["https://raw.githubusercontent.com/
+                ["https://raw.githubusercontent.com/gradio-app/gradio/main/demo/kitchen_sink/files/cheetah1.jpg", "What's in this image?"],
+                ["https://raw.githubusercontent.com/gradio-app/gradio/main/demo/kitchen_sink/files/lion.jpg", "Describe this animal."],
             ],
             inputs=[input_image, custom_prompt],
         )
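
For reference, the finder/loader technique added above can be smoke-tested on its own, before it is wired into app.py. A minimal standalone sketch follows; StubFinder and StubLoader are illustrative names, not the helpers used in this commit.

import sys
import types
import importlib.abc
import importlib.machinery
import importlib.util

class StubFinder(importlib.abc.MetaPathFinder):
    """Answer any flash_attn* import with an empty stub module."""
    def find_spec(self, fullname, path, target=None):
        if fullname == "flash_attn" or fullname.startswith("flash_attn."):
            return importlib.machinery.ModuleSpec(
                fullname, StubLoader(), is_package=(fullname == "flash_attn")
            )
        return None

class StubLoader(importlib.abc.Loader):
    def create_module(self, spec):
        # The import machinery fills in __spec__, __loader__, __package__ afterwards
        return types.ModuleType(spec.name)

    def exec_module(self, module):
        pass  # a stub has nothing to execute

sys.meta_path.insert(0, StubFinder())

print(importlib.util.find_spec("flash_attn") is not None)  # True: no "__spec__ is not set" error
import flash_attn
import flash_attn.flash_attn_interface                     # submodules resolve the same way
print(flash_attn.__spec__ is not None)                     # True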