C10X committed on
Commit
b0e3bb1
·
verified ·
1 Parent(s): 7c858f7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -59
app.py CHANGED
@@ -20,7 +20,8 @@ import matplotlib.pyplot as plt
20
  import seaborn as sns
21
  from datetime import datetime
22
  import warnings
23
- from huggingface_hub import HfApi, create_repo, upload_file, snapshot_download, whoami, HfFolder
 
24
  from pathlib import Path
25
  from textwrap import dedent
26
  from scipy import stats
@@ -70,17 +71,12 @@ DESCRIPTION_MD = """
70
  """
71
 
72
  # --- Helper Functions ---
73
- # ==============================================================================
74
- # --- HATAYI GİDEREN KESİN VE NİHAİ DÜZELTME BURADA ---
75
- # `escape` fonksiyonu, olması gereken doğru haline geri getirildi.
76
- # ==============================================================================
77
  def escape(s: str) -> str:
78
- """Escape special characters for safe HTML display."""
79
  s = str(s)
80
- s = s.replace("&", "&")
81
- s = s.replace("<", "&lt;")
82
- s = s.replace(">", "&gt;")
83
- s = s.replace('"', "&quot;")
84
  s = s.replace("\n", "<br/>")
85
  return s
86
 
@@ -94,68 +90,49 @@ def fasttext_preprocess(content: str, tokenizer) -> str:
94
  return re.sub(r' +', ' ', content).strip()
95
 
96
  def fasttext_infer(norm_content: str, model) -> Tuple[str, float]:
97
- """Run inference using the FastText model.
98
-
99
- Args:
100
- norm_content: Normalized text content to score
101
- model: Loaded FastText model
102
-
103
- Returns:
104
- Tuple of (predicted_label, score) where score is between 0 and 1
105
- """
106
  try:
107
- # Get prediction from model
108
- pred_label, pred_prob = model.predict(norm_content)
109
-
110
- # Handle different label formats
111
- if isinstance(pred_label, (list, np.ndarray)) and len(pred_label) > 0:
112
- pred_label = pred_label[0]
113
-
114
- # Default score if we can't process it
115
- score = 0.5
116
 
117
- # Handle different probability formats
118
- if pred_prob is not None:
119
- # If it's a numpy array, convert to list
120
- if hasattr(pred_prob, 'tolist'):
121
- pred_prob = pred_prob.tolist()
122
 
123
- # Handle list/array formats
124
- if isinstance(pred_prob, (list, np.ndarray)) and len(pred_prob) > 0:
125
- # Get first element if it's a nested structure
126
- first_prob = pred_prob[0] if not isinstance(pred_prob[0], (list, np.ndarray)) else pred_prob[0][0]
127
- score = float(first_prob)
128
- else:
129
- # Try direct conversion if it's a single value
130
- score = float(pred_prob)
131
-
132
- # Ensure score is between 0 and 1
133
- score = max(0.0, min(1.0, score))
134
- return pred_label, score
135
-
136
  except Exception as e:
137
  print(f"Error in fasttext_infer: {e}")
138
  return "__label__neg", 0.0
139
 
 
 
 
 
140
  def load_models():
 
141
  global MODEL_LOADED, fasttext_model, tokenizer
142
- if MODEL_LOADED and tokenizer is not None and fasttext_model is not None:
143
- return tokenizer, fasttext_model
144
 
145
  try:
146
  model_dir = MODEL_CACHE_DIR / "Ultra-FineWeb-classifier"
147
  if not model_dir.exists():
 
148
  snapshot_download(repo_id="openbmb/Ultra-FineWeb-classifier", local_dir=str(model_dir), local_dir_use_symlinks=False)
149
 
150
- # Load tokenizer and model
151
- tokenizer = LlamaTokenizerFast.from_pretrained(str(model_dir / "tokenizer"))
152
- fasttext_model = fasttext.load_model(str(model_dir / "classifier.bin"))
 
 
 
153
 
154
  MODEL_LOADED = True
155
- return tokenizer, fasttext_model
 
156
  except Exception as e:
157
  print(f"Error loading models: {e}")
158
- return None, None
 
159
 
160
  def create_quality_plot(scores: List[float], dataset_name: str) -> str:
161
  with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmpfile:
@@ -190,6 +167,7 @@ def process_dataset(
190
  try:
191
  yield update_log("Starting process...")
192
  yield update_log("Loading scoring models...")
 
193
  if not load_models():
194
  raise gr.Error("Failed to load scoring models. Please check logs.")
195
  yield update_log("Models loaded successfully.")
@@ -299,10 +277,10 @@ def create_demo():
299
  with gr.Row():
300
  with gr.Column(scale=3):
301
  gr.Markdown("### 1. Configure Dataset")
302
- dataset_id = gr.Textbox(
303
  label="Hugging Face Dataset ID",
304
- value="roneneldan/TinyStories",
305
- placeholder="username/dataset_name"
306
  )
307
  text_column = gr.Textbox(label="Text Column Name", value="text")
308
  with gr.Column(scale=2):
@@ -346,15 +324,15 @@ def create_demo():
346
 
347
  process_btn.click(
348
  fn=process_dataset,
349
- inputs=[dataset_id, dataset_split, text_column, sample_size, batch_size],
350
  outputs=outputs_list
351
  )
352
 
353
  clear_btn.click(
354
  fn=clear_form,
355
  outputs=[
356
- dataset_id, dataset_split, text_column, sample_size, batch_size, live_log,
357
- summary_output, scored_file_output, stats_file_output, plot_output,
358
  results_group, upload_group, upload_status
359
  ]
360
  )
 
20
  import seaborn as sns
21
  from datetime import datetime
22
  import warnings
23
+ from huggingface_hub import HfApi, create_repo, upload_file, snapshot_download, whoami
24
+ from gradio_huggingfacehub_search import HuggingfaceHubSearch
25
  from pathlib import Path
26
  from textwrap import dedent
27
  from scipy import stats
 
71
  """
72
 
73
  # --- Helper Functions ---
 
 
 
 
def escape(s: str) -> str:
    """Escape text so it can be embedded safely in HTML output.

    The value is coerced with ``str()`` first, so non-string inputs are
    accepted. The characters ``&``, ``<``, ``>`` and ``"`` are replaced by
    their HTML entities and newlines become ``<br/>`` tags.

    Args:
        s: Value to render; anything convertible with ``str()``.

    Returns:
        The escaped string.
    """
    s = str(s)
    # '&' has to be handled first: the entities inserted below contain '&'
    # themselves and must not be escaped a second time.
    s = s.replace("&", "&amp;")
    s = s.replace("<", "&lt;")
    s = s.replace(">", "&gt;")
    s = s.replace('"', "&quot;")
    # Newlines are converted last so the inserted <br/> tag is not escaped.
    s = s.replace("\n", "<br/>")
    return s
82
 
 
90
  return re.sub(r' +', ' ', content).strip()
91
 
def fasttext_infer(norm_content: str, model) -> Tuple[str, float]:
    """Score one piece of normalized text with the FastText classifier.

    ``model.predict`` is expected to return a ``(labels, probabilities)``
    pair; only the first entry of each is used. When the predicted label is
    ``"__label__neg"`` the probability is inverted (``1 - p``), so the
    returned score is always expressed relative to the non-negative class.

    Args:
        norm_content: Preprocessed text to classify.
        model: Object exposing ``predict(text) -> (labels, probabilities)``.

    Returns:
        ``(label, score)`` with ``score`` clamped to ``[0.0, 1.0]``.
        ``("__label__neg", 0.0)`` is returned when the model yields no
        prediction or raises; a NaN probability yields a score of ``0.0``.
    """
    import math  # local import: keeps this block self-contained

    try:
        pred_label_arr, pred_prob_arr = model.predict(norm_content)

        # No prediction at all (e.g. nothing to classify): treat it as the
        # worst score instead of relying on an IndexError below.
        if len(pred_label_arr) == 0 or len(pred_prob_arr) == 0:
            return "__label__neg", 0.0

        pred_label = pred_label_arr[0]
        score = float(pred_prob_arr[0])

        # NaN would slip through the min/max clamp below and come out as
        # 1.0, i.e. the best possible score, so reject it explicitly.
        if math.isnan(score):
            return pred_label, 0.0

        if pred_label == "__label__neg":
            score = 1 - score

        return pred_label, max(0.0, min(1.0, score))
    except Exception as e:
        # Deliberate best-effort: one bad sample must not abort a whole run.
        print(f"Error in fasttext_infer: {e}")
        return "__label__neg", 0.0
105
 
# load_models() reports its outcome as a plain bool so that callers can guard
# with `if not load_models(): ...`.
def load_models():
    """Load the tokenizer and FastText classifier into module globals.

    On first use the model repository is downloaded into
    ``MODEL_CACHE_DIR / "Ultra-FineWeb-classifier"`` if the required files are
    not present. Both models are then loaded and stored in the ``tokenizer``
    and ``fasttext_model`` globals. Later calls return immediately once
    ``MODEL_LOADED`` is set.

    Returns:
        True if the models are available (already loaded or loaded now),
        False if downloading or loading failed. Failures are printed and
        also reported through ``gr.Warning``.
    """
    global MODEL_LOADED, fasttext_model, tokenizer
    if MODEL_LOADED:
        return True

    try:
        model_dir = MODEL_CACHE_DIR / "Ultra-FineWeb-classifier"
        tokenizer_path = model_dir / "local_tokenizer"
        fasttext_path = model_dir / "classifiers" / "ultra_fineweb_en.bin"

        # Check the actual artifacts rather than just the directory: an
        # interrupted download leaves the directory behind, and testing only
        # `model_dir.exists()` would then never fetch the missing files.
        if not (tokenizer_path.exists() and fasttext_path.exists()):
            print("Downloading model files...")
            snapshot_download(repo_id="openbmb/Ultra-FineWeb-classifier", local_dir=str(model_dir), local_dir_use_symlinks=False)

        print("Loading tokenizer and model...")
        loaded_tokenizer = LlamaTokenizerFast.from_pretrained(str(tokenizer_path))
        loaded_model = fasttext.load_model(str(fasttext_path))

        # Publish both globals together so a failure while loading the second
        # model never leaves the module half-initialised.
        tokenizer = loaded_tokenizer
        fasttext_model = loaded_model
        MODEL_LOADED = True
        print("Models loaded successfully.")
        return True
    except Exception as e:
        print(f"Error loading models: {e}")
        gr.Warning(f"Failed to load models: {e}")
        return False
136
 
137
  def create_quality_plot(scores: List[float], dataset_name: str) -> str:
138
  with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmpfile:
 
167
  try:
168
  yield update_log("Starting process...")
169
  yield update_log("Loading scoring models...")
170
+ # Düzeltilmiş kontrol mekanizması
171
  if not load_models():
172
  raise gr.Error("Failed to load scoring models. Please check logs.")
173
  yield update_log("Models loaded successfully.")
 
277
  with gr.Row():
278
  with gr.Column(scale=3):
279
  gr.Markdown("### 1. Configure Dataset")
280
+ dataset_search = HuggingfaceHubSearch(
281
  label="Hugging Face Dataset ID",
282
+ search_type="dataset",
283
+ value="roneneldan/TinyStories"
284
  )
285
  text_column = gr.Textbox(label="Text Column Name", value="text")
286
  with gr.Column(scale=2):
 
324
 
325
  process_btn.click(
326
  fn=process_dataset,
327
+ inputs=[dataset_search, dataset_split, text_column, sample_size, batch_size],
328
  outputs=outputs_list
329
  )
330
 
331
  clear_btn.click(
332
  fn=clear_form,
333
  outputs=[
334
+ dataset_search, dataset_split, text_column, sample_size, batch_size,
335
+ live_log, summary_output, scored_file_output, stats_file_output, plot_output,
336
  results_group, upload_group, upload_status
337
  ]
338
  )