Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -8,7 +8,7 @@ SAMPLE_SIZE = int(os.getenv("SAMPLE_SIZE", "5000"))
|
|
8 |
RANDOM_STATE = 42
|
9 |
DEFAULT_INPUT = "I am so happy with this product"
|
10 |
|
11 |
-
# --------
|
12 |
def clean_text(text: str) -> str:
|
13 |
text = text.lower()
|
14 |
text = re.sub(r"http\S+", "", text)
|
@@ -18,6 +18,9 @@ def clean_text(text: str) -> str:
|
|
18 |
text = re.sub(r"\s+", " ", text).strip()
|
19 |
return text
|
20 |
|
|
|
|
|
|
|
21 |
# -------- Load sample data once --------
|
22 |
@functools.lru_cache(maxsize=1)
|
23 |
def load_sample_df():
|
@@ -57,9 +60,8 @@ def ensure_corpus_embeddings(model_name: str, texts: list[str]):
|
|
57 |
return _CORPUS_CACHE[model_name]
|
58 |
model_id = EMBEDDERS[model_name]
|
59 |
model = load_sentence_model(model_id)
|
60 |
-
emb = model.encode(
|
61 |
-
|
62 |
-
)
|
63 |
_CORPUS_CACHE[model_name] = emb
|
64 |
return emb
|
65 |
|
@@ -71,7 +73,8 @@ def top3_for_each_model(user_input: str, selected_models: list[str]):
|
|
71 |
try:
|
72 |
model = load_sentence_model(EMBEDDERS[name])
|
73 |
corpus_emb = ensure_corpus_embeddings(name, texts)
|
74 |
-
q = model.encode([clean_text(user_input)], show_progress_bar=False,
|
|
|
75 |
sims = cosine_similarity(q, corpus_emb)[0]
|
76 |
top_idx = sims.argsort()[-3:][::-1]
|
77 |
for rank, i in enumerate(top_idx, start=1):
|
@@ -84,7 +87,7 @@ def top3_for_each_model(user_input: str, selected_models: list[str]):
|
|
84 |
})
|
85 |
except Exception as e:
|
86 |
rows.append({
|
87 |
-
"Model": name, "Rank": "-", "Similarity": "-",
|
88 |
"Tweet (clean)": f"[Error: {e}]", "Tweet (orig)": ""
|
89 |
})
|
90 |
out = pd.DataFrame(rows, columns=["Model","Rank","Similarity","Tweet (clean)","Tweet (orig)"])
|
@@ -104,8 +107,8 @@ def generate_and_pick_best(prompt: str, n_sequences: int, max_length: int, tempe
|
|
104 |
|
105 |
scorer_id = EMBEDDERS[scorer_model_name]
|
106 |
scorer = load_sentence_model(scorer_id)
|
107 |
-
q = scorer.encode([prompt], show_progress_bar=False,
|
108 |
-
cand_vecs = scorer.encode(candidates, show_progress_bar=False,
|
109 |
sims = cosine_similarity(q, cand_vecs)[0]
|
110 |
best_idx = int(sims.argmax())
|
111 |
table = pd.DataFrame({
|
@@ -135,7 +138,7 @@ Type a tweet, get similar tweets from Sentiment140, and generate a new one.
|
|
135 |
)
|
136 |
|
137 |
run_btn = gr.Button("🔎 Find Top‑3 Similar Tweets")
|
138 |
-
table_out = gr.Dataframe(interactive=False)
|
139 |
|
140 |
run_btn.click(top3_for_each_model, inputs=[test_input, models], outputs=table_out)
|
141 |
|
|
|
8 |
RANDOM_STATE = 42
|
9 |
DEFAULT_INPUT = "I am so happy with this product"
|
10 |
|
11 |
+
# -------- Helpers --------
|
12 |
def clean_text(text: str) -> str:
|
13 |
text = text.lower()
|
14 |
text = re.sub(r"http\S+", "", text)
|
|
|
18 |
text = re.sub(r"\s+", " ", text).strip()
|
19 |
return text
|
20 |
|
21 |
+
def _l2norm(x: np.ndarray) -> np.ndarray:
    """Scale vectors to unit L2 (Euclidean) length.

    Args:
        x: A 2-D array holding one vector per row, or a single 1-D vector.
            Array-likes such as nested lists are accepted as well.

    Returns:
        An array of the same shape as ``x`` in which every row (or the lone
        1-D vector) has been divided by its Euclidean norm.
    """
    x = np.asanyarray(x)
    # A lone 1-D vector has no axis 1; normalise over its only axis instead
    # of raising AxisError. Inputs with two or more dimensions keep axis 1.
    axis = 0 if x.ndim == 1 else 1
    # The small epsilon keeps all-zero vectors from dividing by zero.
    return x / (np.linalg.norm(x, axis=axis, keepdims=True) + 1e-12)
|
23 |
+
|
24 |
# -------- Load sample data once --------
|
25 |
@functools.lru_cache(maxsize=1)
|
26 |
def load_sample_df():
|
|
|
60 |
return _CORPUS_CACHE[model_name]
|
61 |
model_id = EMBEDDERS[model_name]
|
62 |
model = load_sentence_model(model_id)
|
63 |
+
emb = model.encode(texts, show_progress_bar=False, convert_to_numpy=True)
|
64 |
+
emb = _l2norm(emb)
|
|
|
65 |
_CORPUS_CACHE[model_name] = emb
|
66 |
return emb
|
67 |
|
|
|
73 |
try:
|
74 |
model = load_sentence_model(EMBEDDERS[name])
|
75 |
corpus_emb = ensure_corpus_embeddings(name, texts)
|
76 |
+
q = model.encode([clean_text(user_input)], show_progress_bar=False, convert_to_numpy=True)
|
77 |
+
q = _l2norm(q)
|
78 |
sims = cosine_similarity(q, corpus_emb)[0]
|
79 |
top_idx = sims.argsort()[-3:][::-1]
|
80 |
for rank, i in enumerate(top_idx, start=1):
|
|
|
87 |
})
|
88 |
except Exception as e:
|
89 |
rows.append({
|
90 |
+
"Model": name, "Rank": "-", "Similarity": "-",
|
91 |
"Tweet (clean)": f"[Error: {e}]", "Tweet (orig)": ""
|
92 |
})
|
93 |
out = pd.DataFrame(rows, columns=["Model","Rank","Similarity","Tweet (clean)","Tweet (orig)"])
|
|
|
107 |
|
108 |
scorer_id = EMBEDDERS[scorer_model_name]
|
109 |
scorer = load_sentence_model(scorer_id)
|
110 |
+
q = scorer.encode([prompt], show_progress_bar=False, convert_to_numpy=True); q = _l2norm(q)
|
111 |
+
cand_vecs = scorer.encode(candidates, show_progress_bar=False, convert_to_numpy=True); cand_vecs = _l2norm(cand_vecs)
|
112 |
sims = cosine_similarity(q, cand_vecs)[0]
|
113 |
best_idx = int(sims.argmax())
|
114 |
table = pd.DataFrame({
|
|
|
138 |
)
|
139 |
|
140 |
run_btn = gr.Button("🔎 Find Top‑3 Similar Tweets")
|
141 |
+
table_out = gr.Dataframe(interactive=False)
|
142 |
|
143 |
run_btn.click(top3_for_each_model, inputs=[test_input, models], outputs=table_out)
|
144 |
|