Properly mock flash_attn module with __spec__ attribute
app.py CHANGED
@@ -6,6 +6,8 @@ from PIL import Image
 import traceback
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from transformers.generation import GenerationConfig
+import importlib.util
+import importlib.machinery
 
 print("=" * 50)
 print("InternVL2-8B IMAGE & TEXT ANALYSIS")
@@ -29,28 +31,79 @@ if torch.cuda.is_available():
 else:
     print("CUDA is not available. This application requires GPU acceleration.")
 
+# Create a proper flash_attn mock module before loading the model
+def setup_flash_attn_mock():
+    # Create a more complete mock for flash_attn
+    print("Setting up a proper flash_attn mock...")
+
+    # First, remove any existing flash_attn module if it exists
+    if "flash_attn" in sys.modules:
+        del sys.modules["flash_attn"]
+
+    # Create a simple Python file with flash_attn mock code
+    flash_attn_path = os.path.join(os.getcwd(), "flash_attn.py")
+    with open(flash_attn_path, "w") as f:
+        f.write("""
+# Mock flash_attn module
+__version__ = "0.0.0-disabled"
+
+def flash_attn_func(*args, **kwargs):
+    raise NotImplementedError("This is a mock flash_attn implementation")
+
+def flash_attn_kvpacked_func(*args, **kwargs):
+    raise NotImplementedError("This is a mock flash_attn implementation")
+
+def flash_attn_qkvpacked_func(*args, **kwargs):
+    raise NotImplementedError("This is a mock flash_attn implementation")
+
+# Add any other functions that might be needed
+""")
+
+    # Load the mock module properly with spec
+    spec = importlib.util.spec_from_file_location("flash_attn", flash_attn_path)
+    flash_attn_module = importlib.util.module_from_spec(spec)
+    sys.modules["flash_attn"] = flash_attn_module
+    spec.loader.exec_module(flash_attn_module)
+
+    # Now also create the flash_attn_2_cuda if needed
+    if "flash_attn_2_cuda" not in sys.modules:
+        flash_attn_2_path = os.path.join(os.getcwd(), "flash_attn_2_cuda.py")
+        with open(flash_attn_2_path, "w") as f:
+            f.write("# Mock flash_attn_2_cuda module\n")
+
+        spec_cuda = importlib.util.spec_from_file_location("flash_attn_2_cuda", flash_attn_2_path)
+        flash_attn_2_cuda_module = importlib.util.module_from_spec(spec_cuda)
+        sys.modules["flash_attn_2_cuda"] = flash_attn_2_cuda_module
+        spec_cuda.loader.exec_module(flash_attn_2_cuda_module)
+
+    print("Flash-attention mock modules set up successfully")
+
 # Create a function to load the model
 def load_model():
     try:
         print("\nLoading InternVL2-8B model...")
 
-        # …
-        sys.modules["flash_attn"] = flash_attn_module
-        print("Created dummy flash_attn module to avoid dependency error")
+        # Set up proper mock modules for flash_attn
+        setup_flash_attn_mock()
+
+        # Disable flash attention in transformers by patching environment vars
+        os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "1"
+        os.environ["TRANSFORMERS_OFFLINE"] = "1"  # Avoid online checks for flash_attn
 
         # Load the model and tokenizer
         model_path = "OpenGVLab/InternVL2-8B"
+        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+
+        print("Loading model (this may take a while)...")
+        # Add specific flags to avoid flash_attn usage
         model = AutoModelForCausalLM.from_pretrained(
             model_path,
             torch_dtype=torch.bfloat16,
             device_map="auto",
-            trust_remote_code=True
+            trust_remote_code=True,
+            use_flash_attention_2=False,  # Explicitly disable flash attention
+            attn_implementation="eager"  # Use eager implementation instead
         )
 
         # Define generation config