Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -33,10 +33,11 @@ warnings.filterwarnings('ignore')
|
|
33 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
34 |
|
35 |
# Global variables for model caching
|
36 |
-
MODEL_CACHE_DIR = Path.home() / ".cache" / "ultra_fineweb"
|
37 |
-
MODEL_CACHE_DIR.mkdir(parents=True, exist_ok=True)
|
38 |
fasttext_model = None
|
39 |
tokenizer = None
|
|
|
|
|
|
|
40 |
|
41 |
# CSS
|
42 |
css = """
|
@@ -51,26 +52,29 @@ css = """
|
|
51 |
}
|
52 |
"""
|
53 |
|
54 |
-
#
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
"""
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
1. Choose a dataset from Hugging Face Hub.
|
65 |
-
2. The Ultra-FineWeb classifier will score each text sample.
|
66 |
-
3. View quality distribution and download the scored dataset.
|
67 |
-
4. Optionally, upload the results to a new repository on your Hugging Face account.
|
68 |
|
69 |
-
|
70 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
71 |
|
72 |
# --- Helper Functions ---
|
73 |
def escape(s: str) -> str:
|
|
|
74 |
s = str(s)
|
75 |
s = s.replace("&", "&amp;")
|
76 |
s = s.replace("<", "&lt;")
|
@@ -102,15 +106,10 @@ def fasttext_infer(norm_content: str, model) -> Tuple[str, float]:
|
|
102 |
print(f"Error in fasttext_infer: {e}")
|
103 |
return "__label__neg", 0.0
|
104 |
|
105 |
-
# ==============================================================================
|
106 |
-
# --- HATAYI GİDEREN KESİN VE NİHAİ DÜZELTME BURADA ---
|
107 |
-
# `MODEL_LOADED` bayrağı kaldırıldı. Artık doğrudan değişkenlerin dolu olup olmadığı kontrol ediliyor.
|
108 |
-
# ==============================================================================
|
109 |
def load_models():
|
110 |
-
|
111 |
global fasttext_model, tokenizer
|
112 |
|
113 |
-
# Bayrak yerine doğrudan değişkenleri kontrol et. Bu en güvenli yöntemdir.
|
114 |
if tokenizer is not None and fasttext_model is not None:
|
115 |
return True
|
116 |
|
@@ -131,7 +130,6 @@ def load_models():
|
|
131 |
return True
|
132 |
except Exception as e:
|
133 |
print(f"Error loading models: {e}")
|
134 |
-
# Hata durumunda değişkenleri tekrar None yap ki bir sonraki sefer yeniden yüklensin.
|
135 |
tokenizer = None
|
136 |
fasttext_model = None
|
137 |
gr.Warning(f"Failed to load models: {e}")
|
@@ -170,7 +168,6 @@ def process_dataset(
|
|
170 |
try:
|
171 |
yield update_log("Starting process...")
|
172 |
yield update_log("Loading scoring models...")
|
173 |
-
# Düzeltilmiş ve artık doğru çalışan kontrol mekanizması
|
174 |
if not load_models():
|
175 |
raise gr.Error("Failed to load scoring models. Please check logs.")
|
176 |
yield update_log("Models loaded successfully.")
|
|
|
33 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
34 |
|
35 |
# Global variables for model caching
|
|
|
|
|
36 |
fasttext_model = None
|
37 |
tokenizer = None
|
38 |
+
MODEL_CACHE_DIR = Path.home() / ".cache" / "ultra_fineweb"
|
39 |
+
MODEL_CACHE_DIR.mkdir(parents=True, exist_ok=True)
|
40 |
+
|
41 |
|
42 |
# CSS
|
43 |
css = """
|
|
|
52 |
}
|
53 |
"""
|
54 |
|
55 |
+
# ==============================================================================
|
56 |
+
# --- HATAYI GİDEREN KESİN VE NİHAİ DÜZELTME BURADA ---
|
57 |
+
# Tüm çok satırlı metin blokları (""") kaldırıldı ve standart string'lere dönüştürüldü.
|
58 |
+
# ==============================================================================
|
59 |
+
TITLE = (
|
60 |
+
'<div style="text-align: center; margin-bottom: 30px;">'
|
61 |
+
'<h1 style="font-size: 36px; margin-bottom: 10px;">Create your own Dataset Quality Scores, blazingly fast ⚡!</h1>'
|
62 |
+
'<p style="font-size: 16px; color: #666;">The space takes a HF dataset as input, scores it and provides statistics and quality distribution.</p>'
|
63 |
+
'</div>'
|
64 |
+
)
|
|
|
|
|
|
|
|
|
65 |
|
66 |
+
DESCRIPTION_MD = (
|
67 |
+
"### 📋 How it works:\n"
|
68 |
+
"1. Choose a dataset from Hugging Face Hub.\n"
|
69 |
+
"2. The Ultra-FineWeb classifier will score each text sample.\n"
|
70 |
+
"3. View quality distribution and download the scored dataset.\n"
|
71 |
+
"4. Optionally, upload the results to a new repository on your Hugging Face account.\n\n"
|
72 |
+
"**Note:** The first run will download the model (~347MB), which may take a moment."
|
73 |
+
)
|
74 |
|
75 |
# --- Helper Functions ---
|
76 |
def escape(s: str) -> str:
|
77 |
+
# Escape special characters for safe HTML display.
|
78 |
s = str(s)
|
79 |
s = s.replace("&", "&amp;")
|
80 |
s = s.replace("<", "&lt;")
|
|
|
106 |
print(f"Error in fasttext_infer: {e}")
|
107 |
return "__label__neg", 0.0
|
108 |
|
|
|
|
|
|
|
|
|
109 |
def load_models():
|
110 |
+
# Load models into global variables, returning True on success, False on failure.
|
111 |
global fasttext_model, tokenizer
|
112 |
|
|
|
113 |
if tokenizer is not None and fasttext_model is not None:
|
114 |
return True
|
115 |
|
|
|
130 |
return True
|
131 |
except Exception as e:
|
132 |
print(f"Error loading models: {e}")
|
|
|
133 |
tokenizer = None
|
134 |
fasttext_model = None
|
135 |
gr.Warning(f"Failed to load models: {e}")
|
|
|
168 |
try:
|
169 |
yield update_log("Starting process...")
|
170 |
yield update_log("Loading scoring models...")
|
|
|
171 |
if not load_models():
|
172 |
raise gr.Error("Failed to load scoring models. Please check logs.")
|
173 |
yield update_log("Models loaded successfully.")
|