Update app.py
app.py CHANGED
@@ -8,7 +8,7 @@ SAMPLE_SIZE = int(os.getenv("SAMPLE_SIZE", "5000"))
 RANDOM_STATE = 42
 DEFAULT_INPUT = "I am so happy with this product"
 
-# -------- Text cleaning
+# -------- Text cleaning --------
 def clean_text(text: str) -> str:
     text = text.lower()
     text = re.sub(r"http\S+", "", text)
@@ -57,8 +57,9 @@ def ensure_corpus_embeddings(model_name: str, texts: list[str]):
         return _CORPUS_CACHE[model_name]
     model_id = EMBEDDERS[model_name]
     model = load_sentence_model(model_id)
-    emb = model.encode(
-
+    emb = model.encode(
+        texts, show_progress_bar=False, convert_to_numpy=True, normalize_embeddings=True
+    )
     _CORPUS_CACHE[model_name] = emb
     return emb
 
@@ -79,10 +80,13 @@ def top3_for_each_model(user_input: str, selected_models: list[str]):
                     "Rank": rank,
                     "Similarity": float(sims[i]),
                     "Tweet (clean)": texts[i],
-                    "Tweet (orig)": df.loc[i, "text"]
+                    "Tweet (orig)": df.loc[i, "text"],
                 })
         except Exception as e:
-            rows.append({
+            rows.append({
+                "Model": name, "Rank": "-", "Similarity": "-",
+                "Tweet (clean)": f"[Error: {e}]", "Tweet (orig)": ""
+            })
     out = pd.DataFrame(rows, columns=["Model","Rank","Similarity","Tweet (clean)","Tweet (orig)"])
     return out
 
@@ -94,7 +98,7 @@ def generate_and_pick_best(prompt: str, n_sequences: int, max_length: int, tempe
         num_return_sequences=n_sequences,
         do_sample=True,
         temperature=temperature,
-        pad_token_id=50256,
+        pad_token_id=50256,  # silence warning
     )
     candidates = [o["generated_text"].strip() for o in outputs]
 
@@ -105,7 +109,7 @@ def generate_and_pick_best(prompt: str, n_sequences: int, max_length: int, tempe
     sims = cosine_similarity(q, cand_vecs)[0]
     best_idx = int(sims.argmax())
     table = pd.DataFrame({
-        "Rank": np.argsort(-sims)+1,
+        "Rank": np.argsort(-sims) + 1,
         "Similarity": np.sort(sims)[::-1],
         "Generated Tweet": [c for _, c in sorted(zip(-sims, candidates))]
     })
@@ -113,15 +117,12 @@ def generate_and_pick_best(prompt: str, n_sequences: int, max_length: int, tempe
     best_score = float(sims[best_idx])
     return best, best_score, table
 
+# ---------------- UI ----------------
 with gr.Blocks(title="Sentiment140 Embeddings + Generation") as demo:
     gr.Markdown(
         """
 # 🧪 Sentiment140 — Embeddings & Tweet Generator
-
-1) Compare top-3 most similar tweets from **Sentiment140** across embedding models.
-2) Generate synthetic tweets with **DistilGPT‑2** and auto‑pick the best by semantic similarity.
-
-> Tip: Start with **MiniLM (fast)** on CPU Spaces. Add MPNet/DistilRoBERTa if you have a GPU.
+Type a tweet, get similar tweets from Sentiment140, and generate a new one.
 """
     )
 
@@ -130,16 +131,16 @@ Small, reliable demo for your final project:
     models = gr.CheckboxGroup(
         choices=list(EMBEDDERS.keys()),
         value=["MiniLM (fast)"],
-        label="Embedding models to compare"
+        label="Embedding models to compare",
    )
 
     run_btn = gr.Button("🔎 Find Top‑3 Similar Tweets")
-    table_out = gr.Dataframe(interactive=False
+    table_out = gr.Dataframe(interactive=False)  # simple & compatible
 
     run_btn.click(top3_for_each_model, inputs=[test_input, models], outputs=table_out)
 
     gr.Markdown("---")
-    gr.Markdown("## 📝 Generate Tweets and Pick the Best
+    gr.Markdown("## 📝 Generate Tweets and Pick the Best")
 
     with gr.Row():
         n_seq = gr.Slider(3, 15, value=8, step=1, label="Number of candidates")
@@ -150,14 +151,16 @@ Small, reliable demo for your final project:
     gen_btn = gr.Button("✨ Generate & Score")
     best_txt = gr.Textbox(label="Best generated tweet")
     best_score = gr.Number(label="Similarity (best)")
-    gen_table = gr.Dataframe(interactive=False
+    gen_table = gr.Dataframe(interactive=False)
 
-    gen_btn.click(
-
-
+    gen_btn.click(
+        generate_and_pick_best,
+        inputs=[test_input, n_seq, max_len, temp, scorer_model],
+        outputs=[best_txt, best_score, gen_table],
+    )
 
     gr.Markdown("---")
-    gr.Markdown("## 🖼️ Project Photo (optional
+    gr.Markdown("## 🖼️ Project Photo (optional)")
     photo = gr.Image(label="Upload your project photo (jpg/png)", type="filepath")
 
 demo.queue(max_size=32).launch()
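
For context, a small self-contained sketch of the similarity scoring that the fixed `model.encode(...)` call and the new `gen_btn.click(...)` wiring rely on. The corpus texts and the "sentence-transformers/all-MiniLM-L6-v2" model id are illustrative stand-ins (the Space's EMBEDDERS mapping is not shown in this diff). Note that the sketch uses `np.arange(1, n + 1)` for the Rank column, since `np.argsort(-sims) + 1` as written in the diff yields the original indices of the sorted candidates rather than ranks 1..n.

```python
# Illustrative sketch only; mirrors the scoring flow in app.py with stand-in data.
# Assumes sentence-transformers and scikit-learn are installed.
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

corpus = ["i love this phone", "worst day ever", "so happy with this product"]  # stand-in for sampled tweets
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")  # stand-in for EMBEDDERS["MiniLM (fast)"]

# Normalized corpus embeddings, as in the fixed model.encode(...) call
emb = model.encode(corpus, show_progress_bar=False, convert_to_numpy=True, normalize_embeddings=True)

# Embed the query and score it against the corpus
q = model.encode(["I am so happy with this product"], convert_to_numpy=True, normalize_embeddings=True)
sims = cosine_similarity(q, emb)[0]

# Order candidates by descending similarity and build a ranking table
order = np.argsort(-sims)
table = pd.DataFrame({
    "Rank": np.arange(1, len(sims) + 1),   # positional ranks 1..n
    "Similarity": sims[order],
    "Text": [corpus[i] for i in order],
})
print(table.head(3))  # top-3, as top3_for_each_model reports
```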
|