MoraxCheng commited on
Commit
3c96d15
·
1 Parent(s): 634f0ae

Enhance URL handling in transformers with comprehensive validation and retry mechanism; implement cache file validation and cleaning process

Browse files
Files changed (1) hide show
  1. app.py +108 -22
app.py CHANGED
@@ -16,22 +16,59 @@ os.environ['TRANSFORMERS_OFFLINE'] = '0'
16
 
17
  # Patch for transformers 4.17.0 URL issue in HF Spaces
18
  import urllib.parse
 
 
19
 
20
def patch_transformers_url():
    """Fix URL scheme issue in transformers 4.17.0.

    Wraps ``transformers.file_utils.get_from_cache`` so that relative
    ``/api/...`` URLs are resolved against https://huggingface.co before
    the download is attempted.  Any failure while installing the patch is
    reported as a warning and otherwise ignored.
    """
    try:
        import transformers.file_utils

        download_fn = transformers.file_utils.get_from_cache

        def get_from_cache_with_fixed_url(url, *args, **kwargs):
            # Only relative '/api/...' strings need fixing; everything
            # else is passed through untouched.
            fixed = url
            if isinstance(url, str) and url.startswith('/api/'):
                # urljoin keeps the path/query intact while adding the host.
                fixed = urllib.parse.urljoin('https://huggingface.co', url)
            return download_fn(fixed, *args, **kwargs)

        transformers.file_utils.get_from_cache = get_from_cache_with_fixed_url
        print("Applied URL patch for transformers")
    except Exception as e:
        print(f"Warning: Could not patch transformers URL handling: {e}")
37
 
@@ -106,6 +143,56 @@ from tranception import config, model_pytorch
106
  # Model loading configuration
107
  MODEL_CACHE = {}
108
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  def get_model_path(model_name):
110
  """Get model path - always use HF Hub for Zero GPU spaces"""
111
  # In HF Spaces, models are cached automatically by the transformers library
@@ -187,26 +274,20 @@ def load_model_cached(model_type):
187
  model_path = get_model_path(model_name)
188
 
189
  try:
190
- # Clear any corrupted cache files
191
  import shutil
192
  cache_dir = "/tmp/huggingface/transformers"
193
- if os.path.exists(cache_dir):
194
- # Remove corrupted tranception cache files
195
- for file in os.listdir(cache_dir):
196
- if "tranception" in file.lower():
197
- try:
198
- filepath = os.path.join(cache_dir, file)
199
- if os.path.isfile(filepath) and os.path.getsize(filepath) < 1000:
200
- os.remove(filepath)
201
- print(f"Removed corrupted cache file: {file}")
202
- except:
203
- pass
204
-
205
  os.makedirs(cache_dir, exist_ok=True)
206
 
207
- # Try loading with force_download to avoid corrupted cache
208
- # Use HF_ENDPOINT environment variable to ensure proper URL
 
 
 
209
  os.environ["HF_ENDPOINT"] = "https://huggingface.co"
 
 
 
210
 
211
  model = tranception.model_pytorch.TranceptionLMHeadModel.from_pretrained(
212
  model_path,
@@ -220,6 +301,11 @@ def load_model_cached(model_type):
220
  return model
221
  except Exception as e:
222
  print(f"Error loading {model_type} model: {e}")
 
 
 
 
 
223
  print(f"Attempting alternative loading method...")
224
 
225
  # Try alternative loading approach with full URL
 
16
 
17
  # Patch for transformers 4.17.0 URL issue in HF Spaces
18
  import urllib.parse
19
+ import json
20
+ import time
21
 
22
def patch_transformers_url():
    """Fix URL scheme issue in transformers 4.17.0 with comprehensive URL handling.

    Wraps ``transformers.file_utils.get_from_cache`` so that every URL it
    receives is first normalized to an absolute URL (see
    ``normalize_hf_url``) and transient download failures are retried with
    exponential backoff.  Failure to install the patch is non-fatal: a
    warning is printed and the library keeps its original behavior.
    """
    try:
        import transformers.file_utils
        original_get_from_cache = transformers.file_utils.get_from_cache

        def patched_get_from_cache(url, *args, **kwargs):
            if isinstance(url, str):
                url = normalize_hf_url(url)

            # Retry mechanism for flaky network requests.  NOTE(review):
            # this retries on *any* Exception, as the original did — a 404
            # will also be retried; narrowing to network errors would need
            # knowledge of which exceptions get_from_cache raises.
            max_retries = 3
            for attempt in range(max_retries):
                try:
                    return original_get_from_cache(url, *args, **kwargs)
                except Exception as e:
                    if attempt < max_retries - 1:
                        print(f"Download attempt {attempt + 1} failed for {url}: {e}. Retrying...")
                        time.sleep(2 ** attempt)  # Exponential backoff: 1s, 2s
                    else:
                        print(f"All download attempts failed for {url}: {e}")
                        raise

        transformers.file_utils.get_from_cache = patched_get_from_cache
        print("Applied enhanced URL patch for transformers")
    except Exception as e:
        print(f"Warning: Could not patch transformers URL handling: {e}")


def normalize_hf_url(url):
    """Return *url* as an absolute URL, anchored at huggingface.co when relative.

    Handles the malformed URL shapes seen from transformers 4.17.0 in HF
    Spaces: relative API paths ('/api/...'), protocol-relative URLs
    ('//host/...'), and bare relative paths.  Well-formed http(s) URLs are
    returned unchanged.

    Bug fix vs. the previous inline version: the urlparse-failure fallback
    used ``if not url.startswith('https://')`` and therefore prepended the
    HF host even to valid ``http://`` URLs, producing garbage like
    ``https://huggingface.cohttp://...``.
    """
    if url.startswith('/api/'):
        # Relative API URL - anchor at the HF endpoint.
        return 'https://huggingface.co' + url
    if url.startswith('//'):
        # Protocol-relative URL - assume https.
        return 'https:' + url
    if not url.startswith(('http://', 'https://')):
        # Any other relative path - anchor at the HF endpoint.
        return 'https://huggingface.co' + (url if url.startswith('/') else '/' + url)
    # Already absolute: verify it actually carries a host component.
    try:
        if not urllib.parse.urlparse(url).netloc:
            return 'https://huggingface.co' + (url if url.startswith('/') else '/' + url)
    except ValueError:
        # urlparse rejected it; leave the absolute-looking URL as-is rather
        # than mangling a scheme-bearing string.
        pass
    return url
74
 
 
143
  # Model loading configuration
144
  MODEL_CACHE = {}
145
 
146
def validate_cache_file(file_path, min_size=1000):
    """Validate cache file integrity and content.

    Returns a ``(valid, reason)`` tuple.  JSON/config files are judged by
    whether they actually parse as JSON; all other files are judged by a
    minimum-size heuristic (*min_size* bytes), since truncated downloads
    are typically tiny error pages.

    Bug fixes vs. the previous version:
    - the config check used ``'config' in file_path.lower()`` on the FULL
      path, so any file under a directory containing "config" was
      JSON-validated (and then deleted by the caller); now only the
      basename is inspected.
    - the size check ran before the JSON check, so a small-but-valid
      ``config.json`` (often well under 1000 bytes) was misclassified as
      corrupted; JSON files are now validated by parseability alone.
    """
    if not os.path.exists(file_path):
        return False, "File does not exist"

    name = os.path.basename(file_path).lower()
    looks_like_json = name.endswith('.json') or 'config' in name

    if looks_like_json:
        # Parseability, not size, decides validity for JSON-ish files.
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read().strip()
            if not content:
                return False, "Empty JSON file"
            json.loads(content)  # Validate JSON syntax; raises on garbage
            return True, "Valid JSON file"
        except json.JSONDecodeError:
            return False, "Invalid JSON content"
        except Exception as e:
            return False, f"Cannot read JSON file: {e}"

    # Binary/other files: a truncated download is usually tiny.
    try:
        file_size = os.path.getsize(file_path)
    except Exception as e:
        return False, f"Cannot get file size: {e}"
    if file_size < min_size:
        return False, f"File too small ({file_size} bytes < {min_size})"
    return True, "File appears valid"
+
175
def clean_corrupted_cache_files(cache_dir):
    """Delete cache files in *cache_dir* that fail integrity validation.

    Every regular file is checked with ``validate_cache_file``; invalid
    ones are removed best-effort (removal errors are only reported).  A
    summary line is printed when at least one file was deleted.
    """
    if not os.path.exists(cache_dir):
        return

    cleaned_count = 0
    for file in os.listdir(cache_dir):
        filepath = os.path.join(cache_dir, file)
        if not os.path.isfile(filepath):
            continue
        valid, reason = validate_cache_file(filepath)
        if valid:
            continue
        try:
            os.remove(filepath)
        except Exception as e:
            print(f"Could not remove {file}: {e}")
        else:
            print(f"Removed corrupted cache file: {file} ({reason})")
            cleaned_count += 1

    if cleaned_count > 0:
        print(f"Cleaned {cleaned_count} corrupted cache files")
+
196
  def get_model_path(model_name):
197
  """Get model path - always use HF Hub for Zero GPU spaces"""
198
  # In HF Spaces, models are cached automatically by the transformers library
 
274
  model_path = get_model_path(model_name)
275
 
276
  try:
277
+ # Enhanced cache cleaning with validation
278
  import shutil
279
  cache_dir = "/tmp/huggingface/transformers"
 
 
 
 
 
 
 
 
 
 
 
 
280
  os.makedirs(cache_dir, exist_ok=True)
281
 
282
+ # Clean corrupted cache files using the new validation system
283
+ print("Validating and cleaning cache files...")
284
+ clean_corrupted_cache_files(cache_dir)
285
+
286
+ # Enhanced environment setup for robust model loading
287
  os.environ["HF_ENDPOINT"] = "https://huggingface.co"
288
+ os.environ["HUGGINGFACE_HUB_CACHE"] = cache_dir
289
+ os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
290
+ os.environ["HF_HUB_DISABLE_EXPERIMENTAL_WARNING"] = "1"
291
 
292
  model = tranception.model_pytorch.TranceptionLMHeadModel.from_pretrained(
293
  model_path,
 
301
  return model
302
  except Exception as e:
303
  print(f"Error loading {model_type} model: {e}")
304
+ print(f"Error type: {type(e).__name__}")
305
+ if hasattr(e, '__cause__') and e.__cause__:
306
+ print(f"Root cause: {e.__cause__}")
307
+ print(f"Model path used: {model_path}")
308
+ print(f"Cache directory: {cache_dir}")
309
  print(f"Attempting alternative loading method...")
310
 
311
  # Try alternative loading approach with full URL