Salimtoama15 committed on
Commit
a9383ab
·
verified ·
1 Parent(s): 839d7f0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -20
app.py CHANGED
@@ -8,7 +8,7 @@ SAMPLE_SIZE = int(os.getenv("SAMPLE_SIZE", "5000"))
8
  RANDOM_STATE = 42
9
  DEFAULT_INPUT = "I am so happy with this product"
10
 
11
- # -------- Text cleaning (yours) --------
12
  def clean_text(text: str) -> str:
13
  text = text.lower()
14
  text = re.sub(r"http\S+", "", text)
@@ -57,8 +57,9 @@ def ensure_corpus_embeddings(model_name: str, texts: list[str]):
57
  return _CORPUS_CACHE[model_name]
58
  model_id = EMBEDDERS[model_name]
59
  model = load_sentence_model(model_id)
60
- emb = model.encode(texts, show_progress_bar=False,
61
- convert_to_numpy=True, normalize_embeddings=True)
 
62
  _CORPUS_CACHE[model_name] = emb
63
  return emb
64
 
@@ -79,10 +80,13 @@ def top3_for_each_model(user_input: str, selected_models: list[str]):
79
  "Rank": rank,
80
  "Similarity": float(sims[i]),
81
  "Tweet (clean)": texts[i],
82
- "Tweet (orig)": df.loc[i, "text"]
83
  })
84
  except Exception as e:
85
- rows.append({"Model": name, "Rank": "-", "Similarity": "-", "Tweet (clean)": f"[Error: {e}]", "Tweet (orig)": ""})
 
 
 
86
  out = pd.DataFrame(rows, columns=["Model","Rank","Similarity","Tweet (clean)","Tweet (orig)"])
87
  return out
88
 
@@ -94,7 +98,7 @@ def generate_and_pick_best(prompt: str, n_sequences: int, max_length: int, tempe
94
  num_return_sequences=n_sequences,
95
  do_sample=True,
96
  temperature=temperature,
97
- pad_token_id=50256, # <- added
98
  )
99
  candidates = [o["generated_text"].strip() for o in outputs]
100
 
@@ -105,7 +109,7 @@ def generate_and_pick_best(prompt: str, n_sequences: int, max_length: int, tempe
105
  sims = cosine_similarity(q, cand_vecs)[0]
106
  best_idx = int(sims.argmax())
107
  table = pd.DataFrame({
108
- "Rank": np.argsort(-sims)+1,
109
  "Similarity": np.sort(sims)[::-1],
110
  "Generated Tweet": [c for _, c in sorted(zip(-sims, candidates))]
111
  })
@@ -113,15 +117,12 @@ def generate_and_pick_best(prompt: str, n_sequences: int, max_length: int, tempe
113
  best_score = float(sims[best_idx])
114
  return best, best_score, table
115
 
 
116
  with gr.Blocks(title="Sentiment140 Embeddings + Generation") as demo:
117
  gr.Markdown(
118
  """
119
  # 🧪 Sentiment140 — Embeddings & Tweet Generator
120
- Small, reliable demo for your final project:
121
- 1) Compare top-3 most similar tweets from **Sentiment140** across embedding models.
122
- 2) Generate synthetic tweets with **DistilGPT‑2** and auto‑pick the best by semantic similarity.
123
-
124
- > Tip: Start with **MiniLM (fast)** on CPU Spaces. Add MPNet/DistilRoBERTa if you have a GPU.
125
  """
126
  )
127
 
@@ -130,16 +131,16 @@ Small, reliable demo for your final project:
130
  models = gr.CheckboxGroup(
131
  choices=list(EMBEDDERS.keys()),
132
  value=["MiniLM (fast)"],
133
- label="Embedding models to compare"
134
  )
135
 
136
  run_btn = gr.Button("🔎 Find Top‑3 Similar Tweets")
137
- table_out = gr.Dataframe(interactive=False, overflow_row_behaviour="paginate") # <- changed
138
 
139
  run_btn.click(top3_for_each_model, inputs=[test_input, models], outputs=table_out)
140
 
141
  gr.Markdown("---")
142
- gr.Markdown("## 📝 Generate Tweets and Pick the Best (by similarity to your input)")
143
 
144
  with gr.Row():
145
  n_seq = gr.Slider(3, 15, value=8, step=1, label="Number of candidates")
@@ -150,14 +151,16 @@ Small, reliable demo for your final project:
150
  gen_btn = gr.Button("✨ Generate & Score")
151
  best_txt = gr.Textbox(label="Best generated tweet")
152
  best_score = gr.Number(label="Similarity (best)")
153
- gen_table = gr.Dataframe(interactive=False, overflow_row_behaviour="paginate") # <- changed
154
 
155
- gen_btn.click(generate_and_pick_best,
156
- inputs=[test_input, n_seq, max_len, temp, scorer_model],
157
- outputs=[best_txt, best_score, gen_table])
 
 
158
 
159
  gr.Markdown("---")
160
- gr.Markdown("## 🖼️ Project Photo (optional, just to display it in the app)")
161
  photo = gr.Image(label="Upload your project photo (jpg/png)", type="filepath")
162
 
163
  demo.queue(max_size=32).launch()
 
8
  RANDOM_STATE = 42
9
  DEFAULT_INPUT = "I am so happy with this product"
10
 
11
+ # -------- Text cleaning --------
12
  def clean_text(text: str) -> str:
13
  text = text.lower()
14
  text = re.sub(r"http\S+", "", text)
 
57
  return _CORPUS_CACHE[model_name]
58
  model_id = EMBEDDERS[model_name]
59
  model = load_sentence_model(model_id)
60
+ emb = model.encode(
61
+ texts, show_progress_bar=False, convert_to_numpy=True, normalize_embeddings=True
62
+ )
63
  _CORPUS_CACHE[model_name] = emb
64
  return emb
65
 
 
80
  "Rank": rank,
81
  "Similarity": float(sims[i]),
82
  "Tweet (clean)": texts[i],
83
+ "Tweet (orig)": df.loc[i, "text"],
84
  })
85
  except Exception as e:
86
+ rows.append({
87
+ "Model": name, "Rank": "-", "Similarity": "-",
88
+ "Tweet (clean)": f"[Error: {e}]", "Tweet (orig)": ""
89
+ })
90
  out = pd.DataFrame(rows, columns=["Model","Rank","Similarity","Tweet (clean)","Tweet (orig)"])
91
  return out
92
 
 
98
  num_return_sequences=n_sequences,
99
  do_sample=True,
100
  temperature=temperature,
101
+ pad_token_id=50256, # silence warning
102
  )
103
  candidates = [o["generated_text"].strip() for o in outputs]
104
 
 
109
  sims = cosine_similarity(q, cand_vecs)[0]
110
  best_idx = int(sims.argmax())
111
  table = pd.DataFrame({
112
+ "Rank": np.argsort(-sims) + 1,
113
  "Similarity": np.sort(sims)[::-1],
114
  "Generated Tweet": [c for _, c in sorted(zip(-sims, candidates))]
115
  })
 
117
  best_score = float(sims[best_idx])
118
  return best, best_score, table
119
 
120
+ # ---------------- UI ----------------
121
  with gr.Blocks(title="Sentiment140 Embeddings + Generation") as demo:
122
  gr.Markdown(
123
  """
124
  # 🧪 Sentiment140 — Embeddings & Tweet Generator
125
+ Type a tweet, get similar tweets from Sentiment140, and generate a new one.
 
 
 
 
126
  """
127
  )
128
 
 
131
  models = gr.CheckboxGroup(
132
  choices=list(EMBEDDERS.keys()),
133
  value=["MiniLM (fast)"],
134
+ label="Embedding models to compare",
135
  )
136
 
137
  run_btn = gr.Button("🔎 Find Top‑3 Similar Tweets")
138
+ table_out = gr.Dataframe(interactive=False) # simple & compatible
139
 
140
  run_btn.click(top3_for_each_model, inputs=[test_input, models], outputs=table_out)
141
 
142
  gr.Markdown("---")
143
+ gr.Markdown("## 📝 Generate Tweets and Pick the Best")
144
 
145
  with gr.Row():
146
  n_seq = gr.Slider(3, 15, value=8, step=1, label="Number of candidates")
 
151
  gen_btn = gr.Button("✨ Generate & Score")
152
  best_txt = gr.Textbox(label="Best generated tweet")
153
  best_score = gr.Number(label="Similarity (best)")
154
+ gen_table = gr.Dataframe(interactive=False)
155
 
156
+ gen_btn.click(
157
+ generate_and_pick_best,
158
+ inputs=[test_input, n_seq, max_len, temp, scorer_model],
159
+ outputs=[best_txt, best_score, gen_table],
160
+ )
161
 
162
  gr.Markdown("---")
163
+ gr.Markdown("## 🖼️ Project Photo (optional)")
164
  photo = gr.Image(label="Upload your project photo (jpg/png)", type="filepath")
165
 
166
  demo.queue(max_size=32).launch()