C10X committed
Commit 4d87991 · verified · 1 Parent(s): b41824a

Update app.py

Files changed (1)
  1. app.py +23 -26
app.py CHANGED
@@ -33,10 +33,11 @@ warnings.filterwarnings('ignore')
 HF_TOKEN = os.environ.get("HF_TOKEN")
 
 # Global variables for model caching
-MODEL_CACHE_DIR = Path.home() / ".cache" / "ultra_fineweb"
-MODEL_CACHE_DIR.mkdir(parents=True, exist_ok=True)
 fasttext_model = None
 tokenizer = None
+MODEL_CACHE_DIR = Path.home() / ".cache" / "ultra_fineweb"
+MODEL_CACHE_DIR.mkdir(parents=True, exist_ok=True)
+
 
 # CSS
 css = """
@@ -51,26 +52,29 @@ css = """
 }
 """
 
-# HTML templates
-TITLE = """
-<div style="text-align: center; margin-bottom: 30px;">
-<h1 style="font-size: 36px; margin-bottom: 10px;">Create your own Dataset Quality Scores, blazingly fast ⚡!</h1>
-<p style="font-size: 16px; color: #666;">The space takes a HF dataset as input, scores it and provides statistics and quality distribution.</p>
-</div>
-"""
-
-DESCRIPTION_MD = """
-### 📋 How it works:
-1. Choose a dataset from Hugging Face Hub.
-2. The Ultra-FineWeb classifier will score each text sample.
-3. View quality distribution and download the scored dataset.
-4. Optionally, upload the results to a new repository on your Hugging Face account.
+# ==============================================================================
+# --- THE DEFINITIVE AND FINAL FIX FOR THE ERROR IS HERE ---
+# All multi-line text blocks (""") were removed and converted to standard strings.
+# ==============================================================================
+TITLE = (
+    '<div style="text-align: center; margin-bottom: 30px;">'
+    '<h1 style="font-size: 36px; margin-bottom: 10px;">Create your own Dataset Quality Scores, blazingly fast ⚡!</h1>'
+    '<p style="font-size: 16px; color: #666;">The space takes a HF dataset as input, scores it and provides statistics and quality distribution.</p>'
+    '</div>'
+)
 
-**Note:** The first run will download the model (~347MB), which may take a moment.
-"""
+DESCRIPTION_MD = (
+    "### 📋 How it works:\n"
+    "1. Choose a dataset from Hugging Face Hub.\n"
+    "2. The Ultra-FineWeb classifier will score each text sample.\n"
+    "3. View quality distribution and download the scored dataset.\n"
+    "4. Optionally, upload the results to a new repository on your Hugging Face account.\n\n"
+    "**Note:** The first run will download the model (~347MB), which may take a moment."
+)
 
 # --- Helper Functions ---
 def escape(s: str) -> str:
+    # Escape special characters for safe HTML display.
     s = str(s)
     s = s.replace("&", "&amp;")
     s = s.replace("<", "&lt;")
@@ -102,15 +106,10 @@ def fasttext_infer(norm_content: str, model) -> Tuple[str, float]:
         print(f"Error in fasttext_infer: {e}")
         return "__label__neg", 0.0
 
-# ==============================================================================
-# --- THE DEFINITIVE AND FINAL FIX FOR THE ERROR IS HERE ---
-# The `MODEL_LOADED` flag has been removed. Whether the variables are populated is now checked directly.
-# ==============================================================================
 def load_models():
-    """Load models into global variables, returning True on success, False on failure."""
+    # Load models into global variables, returning True on success, False on failure.
     global fasttext_model, tokenizer
 
-    # Check the variables directly instead of a flag. This is the safest method.
     if tokenizer is not None and fasttext_model is not None:
         return True
 
@@ -131,7 +130,6 @@ def load_models():
         return True
     except Exception as e:
         print(f"Error loading models: {e}")
-        # On error, set the variables back to None so they are reloaded next time.
         tokenizer = None
         fasttext_model = None
         gr.Warning(f"Failed to load models: {e}")
@@ -170,7 +168,6 @@ def process_dataset(
     try:
         yield update_log("Starting process...")
         yield update_log("Loading scoring models...")
-        # The corrected check mechanism, which now works properly.
         if not load_models():
             raise gr.Error("Failed to load scoring models. Please check logs.")
         yield update_log("Models loaded successfully.")
 