Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -20,7 +20,8 @@ import matplotlib.pyplot as plt
|
|
20 |
import seaborn as sns
|
21 |
from datetime import datetime
|
22 |
import warnings
|
23 |
-
from huggingface_hub import HfApi, create_repo, upload_file, snapshot_download, whoami
|
|
|
24 |
from pathlib import Path
|
25 |
from textwrap import dedent
|
26 |
from scipy import stats
|
@@ -70,17 +71,12 @@ DESCRIPTION_MD = """
|
|
70 |
"""
|
71 |
|
72 |
# --- Helper Functions ---
|
73 |
-
# ==============================================================================
|
74 |
-
# --- THE DEFINITIVE, FINAL FIX FOR THE ERROR IS HERE ---
|
75 |
-
# The `escape` function has been restored to its correct form.
|
76 |
-
# ==============================================================================
|
77 |
def escape(s: str) -> str:
|
78 |
-
"""Escape special characters for safe HTML display."""
|
79 |
s = str(s)
|
80 |
-
s = s.replace("&", "&
|
81 |
-
s = s.replace("<", "
|
82 |
-
s = s.replace(">", "
|
83 |
-
s = s.replace('"', "
|
84 |
s = s.replace("\n", "<br/>")
|
85 |
return s
|
86 |
|
@@ -94,68 +90,49 @@ def fasttext_preprocess(content: str, tokenizer) -> str:
|
|
94 |
return re.sub(r' +', ' ', content).strip()
|
95 |
|
96 |
def fasttext_infer(norm_content: str, model) -> Tuple[str, float]:
|
97 |
-
"""Run inference using the FastText model.
|
98 |
-
|
99 |
-
Args:
|
100 |
-
norm_content: Normalized text content to score
|
101 |
-
model: Loaded FastText model
|
102 |
-
|
103 |
-
Returns:
|
104 |
-
Tuple of (predicted_label, score) where score is between 0 and 1
|
105 |
-
"""
|
106 |
try:
|
107 |
-
|
108 |
-
pred_label
|
109 |
-
|
110 |
-
# Handle different label formats
|
111 |
-
if isinstance(pred_label, (list, np.ndarray)) and len(pred_label) > 0:
|
112 |
-
pred_label = pred_label[0]
|
113 |
-
|
114 |
-
# Default score if we can't process it
|
115 |
-
score = 0.5
|
116 |
|
117 |
-
|
118 |
-
|
119 |
-
# If it's a numpy array, convert to list
|
120 |
-
if hasattr(pred_prob, 'tolist'):
|
121 |
-
pred_prob = pred_prob.tolist()
|
122 |
|
123 |
-
|
124 |
-
if isinstance(pred_prob, (list, np.ndarray)) and len(pred_prob) > 0:
|
125 |
-
# Get first element if it's a nested structure
|
126 |
-
first_prob = pred_prob[0] if not isinstance(pred_prob[0], (list, np.ndarray)) else pred_prob[0][0]
|
127 |
-
score = float(first_prob)
|
128 |
-
else:
|
129 |
-
# Try direct conversion if it's a single value
|
130 |
-
score = float(pred_prob)
|
131 |
-
|
132 |
-
# Ensure score is between 0 and 1
|
133 |
-
score = max(0.0, min(1.0, score))
|
134 |
-
return pred_label, score
|
135 |
-
|
136 |
except Exception as e:
|
137 |
print(f"Error in fasttext_infer: {e}")
|
138 |
return "__label__neg", 0.0
|
139 |
|
|
|
|
|
|
|
|
|
140 |
def load_models():
|
|
|
141 |
global MODEL_LOADED, fasttext_model, tokenizer
|
142 |
-
if MODEL_LOADED
|
143 |
-
return
|
144 |
|
145 |
try:
|
146 |
model_dir = MODEL_CACHE_DIR / "Ultra-FineWeb-classifier"
|
147 |
if not model_dir.exists():
|
|
|
148 |
snapshot_download(repo_id="openbmb/Ultra-FineWeb-classifier", local_dir=str(model_dir), local_dir_use_symlinks=False)
|
149 |
|
150 |
-
|
151 |
-
|
152 |
-
|
|
|
|
|
|
|
153 |
|
154 |
MODEL_LOADED = True
|
155 |
-
|
|
|
156 |
except Exception as e:
|
157 |
print(f"Error loading models: {e}")
|
158 |
-
|
|
|
159 |
|
160 |
def create_quality_plot(scores: List[float], dataset_name: str) -> str:
|
161 |
with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmpfile:
|
@@ -190,6 +167,7 @@ def process_dataset(
|
|
190 |
try:
|
191 |
yield update_log("Starting process...")
|
192 |
yield update_log("Loading scoring models...")
|
|
|
193 |
if not load_models():
|
194 |
raise gr.Error("Failed to load scoring models. Please check logs.")
|
195 |
yield update_log("Models loaded successfully.")
|
@@ -299,10 +277,10 @@ def create_demo():
|
|
299 |
with gr.Row():
|
300 |
with gr.Column(scale=3):
|
301 |
gr.Markdown("### 1. Configure Dataset")
|
302 |
-
|
303 |
label="Hugging Face Dataset ID",
|
304 |
-
|
305 |
-
|
306 |
)
|
307 |
text_column = gr.Textbox(label="Text Column Name", value="text")
|
308 |
with gr.Column(scale=2):
|
@@ -346,15 +324,15 @@ def create_demo():
|
|
346 |
|
347 |
process_btn.click(
|
348 |
fn=process_dataset,
|
349 |
-
inputs=[
|
350 |
outputs=outputs_list
|
351 |
)
|
352 |
|
353 |
clear_btn.click(
|
354 |
fn=clear_form,
|
355 |
outputs=[
|
356 |
-
|
357 |
-
summary_output, scored_file_output, stats_file_output, plot_output,
|
358 |
results_group, upload_group, upload_status
|
359 |
]
|
360 |
)
|
|
|
20 |
import seaborn as sns
|
21 |
from datetime import datetime
|
22 |
import warnings
|
23 |
+
from huggingface_hub import HfApi, create_repo, upload_file, snapshot_download, whoami
|
24 |
+
from gradio_huggingfacehub_search import HuggingfaceHubSearch
|
25 |
from pathlib import Path
|
26 |
from textwrap import dedent
|
27 |
from scipy import stats
|
|
|
71 |
"""
|
72 |
|
73 |
# --- Helper Functions ---
|
|
|
|
|
|
|
|
|
74 |
def escape(s: str) -> str:
|
|
|
75 |
s = str(s)
|
76 |
+
s = s.replace("&", "&amp;")
|
77 |
+
s = s.replace("<", "&lt;")
|
78 |
+
s = s.replace(">", "&gt;")
|
79 |
+
s = s.replace('"', "&quot;")
|
80 |
s = s.replace("\n", "<br/>")
|
81 |
return s
|
82 |
|
|
|
90 |
return re.sub(r' +', ' ', content).strip()
|
91 |
|
92 |
def fasttext_infer(norm_content: str, model) -> Tuple[str, float]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
93 |
try:
|
94 |
+
pred_label_arr, pred_prob_arr = model.predict(norm_content)
|
95 |
+
pred_label = pred_label_arr[0]
|
96 |
+
score = float(pred_prob_arr[0])
|
|
|
|
|
|
|
|
|
|
|
|
|
97 |
|
98 |
+
if pred_label == "__label__neg":
|
99 |
+
score = 1 - score
|
|
|
|
|
|
|
100 |
|
101 |
+
return pred_label, max(0.0, min(1.0, score))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
102 |
except Exception as e:
|
103 |
print(f"Error in fasttext_infer: {e}")
|
104 |
return "__label__neg", 0.0
|
105 |
|
106 |
+
# ==============================================================================
|
107 |
+
# --- THE DEFINITIVE, FINAL FIX FOR THE ERROR IS HERE ---
|
108 |
+
# load_models now returns only True or False, so the caller's check is reliable.
|
109 |
+
# ==============================================================================
|
110 |
def load_models():
|
111 |
+
"""Load models into global variables, returning True on success, False on failure."""
|
112 |
global MODEL_LOADED, fasttext_model, tokenizer
|
113 |
+
if MODEL_LOADED:
|
114 |
+
return True
|
115 |
|
116 |
try:
|
117 |
model_dir = MODEL_CACHE_DIR / "Ultra-FineWeb-classifier"
|
118 |
if not model_dir.exists():
|
119 |
+
print("Downloading model files...")
|
120 |
snapshot_download(repo_id="openbmb/Ultra-FineWeb-classifier", local_dir=str(model_dir), local_dir_use_symlinks=False)
|
121 |
|
122 |
+
tokenizer_path = model_dir / "local_tokenizer"
|
123 |
+
fasttext_path = model_dir / "classifiers" / "ultra_fineweb_en.bin"
|
124 |
+
|
125 |
+
print("Loading tokenizer and model...")
|
126 |
+
tokenizer = LlamaTokenizerFast.from_pretrained(str(tokenizer_path))
|
127 |
+
fasttext_model = fasttext.load_model(str(fasttext_path))
|
128 |
|
129 |
MODEL_LOADED = True
|
130 |
+
print("Models loaded successfully.")
|
131 |
+
return True
|
132 |
except Exception as e:
|
133 |
print(f"Error loading models: {e}")
|
134 |
+
gr.Warning(f"Failed to load models: {e}")
|
135 |
+
return False
|
136 |
|
137 |
def create_quality_plot(scores: List[float], dataset_name: str) -> str:
|
138 |
with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmpfile:
|
|
|
167 |
try:
|
168 |
yield update_log("Starting process...")
|
169 |
yield update_log("Loading scoring models...")
|
170 |
+
# Corrected check mechanism
|
171 |
if not load_models():
|
172 |
raise gr.Error("Failed to load scoring models. Please check logs.")
|
173 |
yield update_log("Models loaded successfully.")
|
|
|
277 |
with gr.Row():
|
278 |
with gr.Column(scale=3):
|
279 |
gr.Markdown("### 1. Configure Dataset")
|
280 |
+
dataset_search = HuggingfaceHubSearch(
|
281 |
label="Hugging Face Dataset ID",
|
282 |
+
search_type="dataset",
|
283 |
+
value="roneneldan/TinyStories"
|
284 |
)
|
285 |
text_column = gr.Textbox(label="Text Column Name", value="text")
|
286 |
with gr.Column(scale=2):
|
|
|
324 |
|
325 |
process_btn.click(
|
326 |
fn=process_dataset,
|
327 |
+
inputs=[dataset_search, dataset_split, text_column, sample_size, batch_size],
|
328 |
outputs=outputs_list
|
329 |
)
|
330 |
|
331 |
clear_btn.click(
|
332 |
fn=clear_form,
|
333 |
outputs=[
|
334 |
+
dataset_search, dataset_split, text_column, sample_size, batch_size,
|
335 |
+
live_log, summary_output, scored_file_output, stats_file_output, plot_output,
|
336 |
results_group, upload_group, upload_status
|
337 |
]
|
338 |
)
|