Salimtoama15 committed
Commit 81179cf · verified · 1 Parent(s): 61a3f6e

Update app.py

Files changed (1):
  1. app.py  +14 -14
app.py CHANGED
@@ -1,16 +1,17 @@
+# app.py
 import os, re, functools, numpy as np, pandas as pd
 import gradio as gr
 from datasets import load_dataset
 from sklearn.metrics.pairwise import cosine_similarity
 
 # -------- Config --------
-SAMPLE_SIZE = int(os.getenv("SAMPLE_SIZE", "5000"))
+SAMPLE_SIZE = int(os.getenv("SAMPLE_SIZE", "3000"))  # small by default for CPU Spaces
 RANDOM_STATE = 42
 DEFAULT_INPUT = "I am so happy with this product"
 
 # -------- Helpers --------
 def clean_text(text: str) -> str:
-    text = text.lower()
+    text = (text or "").lower()
     text = re.sub(r"http\S+", "", text)
     text = re.sub(r"@\w+", "", text)
     text = re.sub(r"#\w+", "", text)
@@ -33,16 +34,18 @@ def _l2norm(x: np.ndarray) -> np.ndarray:
     x = x.reshape(1, -1)
     return x / (np.linalg.norm(x, axis=1, keepdims=True) + 1e-12)
 
-# -------- Load sample data once --------
+# -------- Load sample data once (FAST: only a slice) --------
 @functools.lru_cache(maxsize=1)
 def load_sample_df():
-    ds = load_dataset("sentiment140", split="train")
+    # Load only a slice (e.g., first 3000 rows) instead of the full 1.6M
+    ds = load_dataset("sentiment140", split=f"train[:{SAMPLE_SIZE}]")
     df = ds.to_pandas()
+
     df = df.dropna(subset=["text", "sentiment"]).copy()
     df["text_length"] = df["text"].str.len()
     df = df[(df["text_length"] >= 5) & (df["text_length"] <= 280)].copy()
     df["clean_text"] = df["text"].apply(clean_text)
-    df = df.sample(min(SAMPLE_SIZE, len(df)), random_state=RANDOM_STATE).reset_index(drop=True)
+    df = df.sample(frac=1.0, random_state=RANDOM_STATE).reset_index(drop=True)
     return df[["text", "clean_text"]]
 
 # -------- Lazy model loaders --------
@@ -68,7 +71,7 @@ EMBEDDERS = {
 _CORPUS_CACHE = {}
 
 def _encode_norm(model, texts):
-    """Encode with any sentence-transformers version and return (n,d) L2-normalized numpy."""
+    """Encode compatibly across sentence-transformers versions; return L2-normalized numpy (n,d)."""
     out = model.encode(texts, show_progress_bar=False)
     out = _to_numpy(out)
     return _l2norm(out)
@@ -81,6 +84,7 @@ def ensure_corpus_embeddings(model_name: str, texts: list):
     _CORPUS_CACHE[model_name] = emb
     return emb
 
+# -------- Retrieval --------
 def top3_for_each_model(user_input: str, selected_models: list):
     df = load_sample_df()
     texts = df["clean_text"].tolist()
@@ -107,7 +111,7 @@ def top3_for_each_model(user_input: str, selected_models: list):
         })
     return pd.DataFrame(rows, columns=["Model","Rank","Similarity","Tweet (clean)","Tweet (orig)"])
 
-# -------- Generator + scoring (with progress) --------
+# -------- Generation + scoring (with progress) --------
 def generate_and_pick_best(prompt: str, n_sequences: int, max_length: int,
                            temperature: float, scorer_model_name: str,
                            progress=gr.Progress()):
@@ -166,9 +170,9 @@ Type a tweet, get similar tweets from Sentiment140, and generate a new one.
     gr.Markdown("## 📝 Generate Tweets and Pick the Best")
 
     with gr.Row():
-        n_seq = gr.Slider(3, 15, value=8, step=1, label="Number of candidates")
-        max_len = gr.Slider(30, 120, value=60, step=1, label="Max length (new tokens)")
-        temp = gr.Slider(0.5, 1.5, value=0.9, step=0.05, label="Temperature")
+        n_seq = gr.Slider(1, 8, value=4, step=1, label="Number of candidates")
+        max_len = gr.Slider(20, 80, value=40, step=1, label="Max length (new tokens)")
+        temp = gr.Slider(0.7, 1.3, value=0.9, step=0.05, label="Temperature")
         scorer_model = gr.Dropdown(list(EMBEDDERS.keys()), value="MiniLM (fast)", label="Scorer embedding")
 
     gen_btn = gr.Button("✨ Generate & Score")
@@ -182,8 +186,4 @@ Type a tweet, get similar tweets from Sentiment140, and generate a new one.
         outputs=[best_txt, best_score, gen_table],
     )
 
-    gr.Markdown("---")
-    gr.Markdown("## 🖼️ Project Photo (optional)")
-    photo = gr.Image(label="Upload your project photo (jpg/png)", type="filepath")
-
 demo.queue(max_size=32).launch()
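
Note on the core change: the new load_sample_df() relies on the datasets split-slicing syntax ("train[:N]") so only SAMPLE_SIZE rows are turned into a DataFrame. Below is a minimal standalone sketch of that loading pattern for reference; the dataset and column names are taken from the diff above, the SAMPLE_SIZE default mirrors the new config line, and this is an illustration, not part of the commit.

# Illustrative sketch only (not part of the commit): the split-slicing
# pattern used by the new load_sample_df().
import os
from datasets import load_dataset

SAMPLE_SIZE = int(os.getenv("SAMPLE_SIZE", "3000"))  # same default as the new config line

# "train[:N]" returns only the first N examples of the split; the download
# still covers the full split, but only the slice is converted to pandas.
ds = load_dataset("sentiment140", split=f"train[:{SAMPLE_SIZE}]")
df = ds.to_pandas()
print(len(df), list(df.columns))  # expect SAMPLE_SIZE rows, including "text" and "sentiment"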