Spaces:

C10X
/

Dataset-Quality-Scorer

Running

App Files Files Community

C10X committed 18 days ago

Commit

7c858f7

verified ·

1 Parent(s): 1605a37

Update app.py

Browse files

Files changed (1) hide show

app.py +57 -37

app.py CHANGED Viewed

@@ -20,8 +20,7 @@ import matplotlib.pyplot as plt
 import seaborn as sns
 from datetime import datetime
 import warnings
-from huggingface_hub import HfApi, create_repo, upload_file, snapshot_download, whoami
-from gradio_huggingfacehub_search import HuggingfaceHubSearch
 from pathlib import Path
 from textwrap import dedent
 from scipy import stats
@@ -71,13 +70,17 @@ DESCRIPTION_MD = """
 """
 # --- Helper Functions ---
 def escape(s: str) -> str:
     """Escape special characters for safe HTML display."""
     s = str(s)
-    s = s.replace("&", "&")
-    s = s.replace("<", "<")
-    s = s.replace(">", ">")
-    s = s.replace('"', """)
     s = s.replace("\n", "<br/>")
     return s
@@ -91,50 +94,68 @@ def fasttext_preprocess(content: str, tokenizer) -> str:
     return re.sub(r' +', ' ', content).strip()
 def fasttext_infer(norm_content: str, model) -> Tuple[str, float]:
-    """Run inference and correctly calculate the quality score."""
     try:
-        pred_label_arr, pred_prob_arr = model.predict(norm_content)
-        pred_label = pred_label_arr[0]
-        score = float(pred_prob_arr[0])
-        # The score represents confidence in the predicted label.
-        # If the prediction is negative, the quality score is 1 - confidence.
-        if pred_label == "__label__neg":
-            score = 1 - score
-        return pred_label, max(0.0, min(1.0, score))
     except Exception as e:
         print(f"Error in fasttext_infer: {e}")
         return "__label__neg", 0.0
 def load_models():
-    """Load models into global variables, with correct paths."""
     global MODEL_LOADED, fasttext_model, tokenizer
-    if MODEL_LOADED:
-        return True
     try:
         model_dir = MODEL_CACHE_DIR / "Ultra-FineWeb-classifier"
         if not model_dir.exists():
-            print("Downloading model files...")
             snapshot_download(repo_id="openbmb/Ultra-FineWeb-classifier", local_dir=str(model_dir), local_dir_use_symlinks=False)
-        # --- FIXED: Use correct paths from the downloaded repository ---
-        tokenizer_path = model_dir / "local_tokenizer"
-        fasttext_path = model_dir / "classifiers" / "ultra_fineweb_en.bin"
-        print("Loading tokenizer and model...")
-        tokenizer = LlamaTokenizerFast.from_pretrained(str(tokenizer_path))
-        fasttext_model = fasttext.load_model(str(fasttext_path))
         MODEL_LOADED = True
-        print("Models loaded successfully.")
-        return True
     except Exception as e:
         print(f"Error loading models: {e}")
-        gr.Warning(f"Failed to load models: {e}")
-        return False
 def create_quality_plot(scores: List[float], dataset_name: str) -> str:
     with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmpfile:
@@ -278,11 +299,10 @@ def create_demo():
         with gr.Row():
             with gr.Column(scale=3):
                 gr.Markdown("### 1. Configure Dataset")
-                # --- IMPROVEMENT: Restored the more user-friendly search component ---
-                dataset_search = HuggingfaceHubSearch(
                     label="Hugging Face Dataset ID",
-                    search_type="dataset",
-                    value="roneneldan/TinyStories"
                 )
                 text_column = gr.Textbox(label="Text Column Name", value="text")
             with gr.Column(scale=2):
@@ -326,15 +346,15 @@ def create_demo():
         process_btn.click(
             fn=process_dataset,
-            inputs=[dataset_search, dataset_split, text_column, sample_size, batch_size],
             outputs=outputs_list
         )
         clear_btn.click(
             fn=clear_form,
             outputs=[
-                dataset_search, dataset_split, text_column, sample_size, batch_size,
-                live_log, summary_output, scored_file_output, stats_file_output, plot_output,
                 results_group, upload_group, upload_status
             ]
         )

 import seaborn as sns
 from datetime import datetime
 import warnings
+from huggingface_hub import HfApi, create_repo, upload_file, snapshot_download, whoami, HfFolder
 from pathlib import Path
 from textwrap import dedent
 from scipy import stats
 """
 # --- Helper Functions ---
+# ==============================================================================
+# --- THE DEFINITIVE, FINAL FIX FOR THE ERROR IS HERE ---
+# The `escape` function has been restored to its correct form.
+# ==============================================================================
 def escape(s: str) -> str:
     """Escape special characters for safe HTML display.

     The value is converted with ``str()`` first, so non-string inputs are
     accepted. ``&``, ``<``, ``>``, ``"`` and ``'`` are replaced by HTML
     entities, then every newline is turned into a ``<br/>`` tag.

     Args:
         s: Value to render as HTML text.

     Returns:
         The escaped string with newlines replaced by ``<br/>``.
     """
     # Function-scope import keeps this block self-contained.
     import html
     # quote=True also escapes the single quote, which the manual replace
     # chain missed; the <br/> substitution must run after escaping so the
     # tag itself is not escaped.
     return html.escape(str(s), quote=True).replace("\n", "<br/>")
     return re.sub(r' +', ' ', content).strip()
 def fasttext_infer(norm_content: str, model) -> Tuple[str, float]:
     """Run inference with the FastText model and return a quality score.

     Args:
         norm_content: Normalized text content to score.
         model: Object exposing ``predict(text)`` that returns a
             ``(labels, probabilities)`` pair.

     Returns:
         Tuple of ``(predicted_label, score)`` with ``score`` in ``[0, 1]``.
         The model's probability is its confidence in the predicted label,
         so for ``"__label__neg"`` the score is ``1 - confidence``; a
         confidently negative prediction therefore yields a low score.
         On any error ``("__label__neg", 0.0)`` is returned.
     """
     try:
         pred_label, pred_prob = model.predict(norm_content)
         # Labels may arrive as a tuple, list or array; unwrap down to the
         # first label so callers always get a plain value, not a container.
         if hasattr(pred_label, "tolist"):
             pred_label = pred_label.tolist()
         while isinstance(pred_label, (list, tuple)):
             if not pred_label:
                 raise ValueError("model returned no label")
             pred_label = pred_label[0]
         # Neutral default when the model gives no probability at all.
         score = 0.5
         if pred_prob is not None:
             if hasattr(pred_prob, "tolist"):
                 pred_prob = pred_prob.tolist()
             # Unwrap (possibly nested) sequences down to the first scalar.
             while isinstance(pred_prob, (list, tuple)):
                 if not pred_prob:
                     raise ValueError("model returned no probability")
                 pred_prob = pred_prob[0]
             score = float(pred_prob)
         # Clamp first: reported probabilities can slightly exceed 1.0.
         score = max(0.0, min(1.0, score))
         if pred_label == "__label__neg":
             score = 1.0 - score
         return pred_label, score
     except Exception as e:
         # Deliberate best-effort: one bad sample must not abort a batch.
         print(f"Error in fasttext_infer: {e}")
         return "__label__neg", 0.0
 def load_models():
     """Load the tokenizer and FastText classifier, caching them in globals.

     Returns the cached pair when a previous call already succeeded.
     Otherwise downloads the ``openbmb/Ultra-FineWeb-classifier`` snapshot
     into ``MODEL_CACHE_DIR`` if the required files are missing, then loads
     both artifacts.

     Returns:
         ``(tokenizer, fasttext_model)`` on success, ``(None, None)`` when
         downloading or loading fails (the error is printed).
     """
     global MODEL_LOADED, fasttext_model, tokenizer
     if MODEL_LOADED and tokenizer is not None and fasttext_model is not None:
         return tokenizer, fasttext_model
     try:
         model_dir = MODEL_CACHE_DIR / "Ultra-FineWeb-classifier"
         # Paths follow the layout of the classifier repository
         # (tokenizer folder plus per-language classifier binaries).
         # NOTE(review): confirm against the repository's file listing.
         tokenizer_path = model_dir / "local_tokenizer"
         fasttext_path = model_dir / "classifiers" / "ultra_fineweb_en.bin"
         # Check the actual artifacts, not just the directory, so an
         # interrupted earlier download is retried instead of failing forever.
         if not (tokenizer_path.exists() and fasttext_path.exists()):
             snapshot_download(repo_id="openbmb/Ultra-FineWeb-classifier", local_dir=str(model_dir), local_dir_use_symlinks=False)
         loaded_tokenizer = LlamaTokenizerFast.from_pretrained(str(tokenizer_path))
         loaded_model = fasttext.load_model(str(fasttext_path))
     except Exception as e:
         print(f"Error loading models: {e}")
         return None, None
     # Publish to the globals only after both loads succeeded, so a failure
     # never leaves a half-initialised pair behind.
     tokenizer, fasttext_model = loaded_tokenizer, loaded_model
     MODEL_LOADED = True
     return tokenizer, fasttext_model
 def create_quality_plot(scores: List[float], dataset_name: str) -> str:
     with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmpfile:
         with gr.Row():
             with gr.Column(scale=3):
                 gr.Markdown("### 1. Configure Dataset")
+                dataset_id = gr.Textbox(
                     label="Hugging Face Dataset ID",
+                    value="roneneldan/TinyStories",
+                    placeholder="username/dataset_name"
                 )
                 text_column = gr.Textbox(label="Text Column Name", value="text")
             with gr.Column(scale=2):
         process_btn.click(
             fn=process_dataset,
+            inputs=[dataset_id, dataset_split, text_column, sample_size, batch_size],
             outputs=outputs_list
         )
         clear_btn.click(
             fn=clear_form,
             outputs=[
+                dataset_id, dataset_split, text_column, sample_size, batch_size, live_log,
+                summary_output, scored_file_output, stats_file_output, plot_output,
                 results_group, upload_group, upload_status
             ]
         )