Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -33,10 +33,11 @@ warnings.filterwarnings('ignore')
|
|
33 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
34 |
|
35 |
# Global variables for model caching
|
36 |
-
MODEL_CACHE_DIR = Path.home() / ".cache" / "ultra_fineweb"
|
37 |
-
MODEL_CACHE_DIR.mkdir(parents=True, exist_ok=True)
|
38 |
fasttext_model = None
|
39 |
tokenizer = None
|
|
|
|
|
|
|
40 |
|
41 |
# CSS
|
42 |
css = """
|
@@ -51,26 +52,29 @@ css = """
|
|
51 |
}
|
52 |
"""
|
53 |
|
54 |
-
#
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
"""
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
1. Choose a dataset from Hugging Face Hub.
|
65 |
-
2. The Ultra-FineWeb classifier will score each text sample.
|
66 |
-
3. View quality distribution and download the scored dataset.
|
67 |
-
4. Optionally, upload the results to a new repository on your Hugging Face account.
|
68 |
|
69 |
-
|
70 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
71 |
|
72 |
# --- Helper Functions ---
|
73 |
def escape(s: str) -> str:
|
|
|
74 |
s = str(s)
|
75 |
s = s.replace("&", "&amp;")
|
76 |
s = s.replace("<", "&lt;")
|
@@ -102,15 +106,10 @@ def fasttext_infer(norm_content: str, model) -> Tuple[str, float]:
|
|
102 |
print(f"Error in fasttext_infer: {e}")
|
103 |
return "__label__neg", 0.0
|
104 |
|
105 |
-
# ==============================================================================
|
106 |
-
# --- HATAYI GİDEREN KESİN VE NİHAİ DÜZELTME BURADA ---
|
107 |
-
# `MODEL_LOADED` bayrağı kaldırıldı. Artık doğrudan değişkenlerin dolu olup olmadığı kontrol ediliyor.
|
108 |
-
# ==============================================================================
|
109 |
def load_models():
|
110 |
-
|
111 |
global fasttext_model, tokenizer
|
112 |
|
113 |
-
# Bayrak yerine doğrudan değişkenleri kontrol et. Bu en güvenli yöntemdir.
|
114 |
if tokenizer is not None and fasttext_model is not None:
|
115 |
return True
|
116 |
|
@@ -131,7 +130,6 @@ def load_models():
|
|
131 |
return True
|
132 |
except Exception as e:
|
133 |
print(f"Error loading models: {e}")
|
134 |
-
# Hata durumunda değişkenleri tekrar None yap ki bir sonraki sefer yeniden yüklensin.
|
135 |
tokenizer = None
|
136 |
fasttext_model = None
|
137 |
gr.Warning(f"Failed to load models: {e}")
|
@@ -170,7 +168,6 @@ def process_dataset(
|
|
170 |
try:
|
171 |
yield update_log("Starting process...")
|
172 |
yield update_log("Loading scoring models...")
|
173 |
-
# Düzeltilmiş ve artık doğru çalışan kontrol mekanizması
|
174 |
if not load_models():
|
175 |
raise gr.Error("Failed to load scoring models. Please check logs.")
|
176 |
yield update_log("Models loaded successfully.")
|
|
|
33 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
34 |
|
35 |
# Global variables for model caching
|
|
|
|
|
36 |
fasttext_model = None
|
37 |
tokenizer = None
|
38 |
+
MODEL_CACHE_DIR = Path.home() / ".cache" / "ultra_fineweb"
|
39 |
+
MODEL_CACHE_DIR.mkdir(parents=True, exist_ok=True)
|
40 |
+
|
41 |
|
42 |
# CSS
|
43 |
css = """
|
|
|
52 |
}
|
53 |
"""
|
54 |
|
55 |
+
# ==============================================================================
|
56 |
+
# --- HATAYI GİDEREN KESİN VE NİHAİ DÜZELTME BURADA ---
|
57 |
+
# Tüm çok satırlı metin blokları (""") kaldırıldı ve standart string'lere dönüştürüldü.
|
58 |
+
# ==============================================================================
|
59 |
+
TITLE = (
|
60 |
+
'<div style="text-align: center; margin-bottom: 30px;">'
|
61 |
+
'<h1 style="font-size: 36px; margin-bottom: 10px;">Create your own Dataset Quality Scores, blazingly fast ⚡!</h1>'
|
62 |
+
'<p style="font-size: 16px; color: #666;">The space takes a HF dataset as input, scores it and provides statistics and quality distribution.</p>'
|
63 |
+
'</div>'
|
64 |
+
)
|
|
|
|
|
|
|
|
|
65 |
|
66 |
+
DESCRIPTION_MD = (
|
67 |
+
"### 📋 How it works:\n"
|
68 |
+
"1. Choose a dataset from Hugging Face Hub.\n"
|
69 |
+
"2. The Ultra-FineWeb classifier will score each text sample.\n"
|
70 |
+
"3. View quality distribution and download the scored dataset.\n"
|
71 |
+
"4. Optionally, upload the results to a new repository on your Hugging Face account.\n\n"
|
72 |
+
"**Note:** The first run will download the model (~347MB), which may take a moment."
|
73 |
+
)
|
74 |
|
75 |
# --- Helper Functions ---
|
76 |
def escape(s: str) -> str:
|
77 |
+
# Escape special characters for safe HTML display.
|
78 |
s = str(s)
|
79 |
s = s.replace("&", "&amp;")
|
80 |
s = s.replace("<", "&lt;")
|
|
|
106 |
print(f"Error in fasttext_infer: {e}")
|
107 |
return "__label__neg", 0.0
|
108 |
|
|
|
|
|
|
|
|
|
109 |
def load_models():
|
110 |
+
# Load models into global variables, returning True on success, False on failure.
|
111 |
global fasttext_model, tokenizer
|
112 |
|
|
|
113 |
if tokenizer is not None and fasttext_model is not None:
|
114 |
return True
|
115 |
|
|
|
130 |
return True
|
131 |
except Exception as e:
|
132 |
print(f"Error loading models: {e}")
|
|
|
133 |
tokenizer = None
|
134 |
fasttext_model = None
|
135 |
gr.Warning(f"Failed to load models: {e}")
|
|
|
168 |
try:
|
169 |
yield update_log("Starting process...")
|
170 |
yield update_log("Loading scoring models...")
|
|
|
171 |
if not load_models():
|
172 |
raise gr.Error("Failed to load scoring models. Please check logs.")
|
173 |
yield update_log("Models loaded successfully.")
|