Salimtoama15 committed on
Commit
61b06dc
·
verified ·
1 Parent(s): a9383ab

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -9
app.py CHANGED
@@ -8,7 +8,7 @@ SAMPLE_SIZE = int(os.getenv("SAMPLE_SIZE", "5000"))
8
  RANDOM_STATE = 42
9
  DEFAULT_INPUT = "I am so happy with this product"
10
 
11
- # -------- Text cleaning --------
12
  def clean_text(text: str) -> str:
13
  text = text.lower()
14
  text = re.sub(r"http\S+", "", text)
@@ -18,6 +18,9 @@ def clean_text(text: str) -> str:
18
  text = re.sub(r"\s+", " ", text).strip()
19
  return text
20
 
 
 
 
21
  # -------- Load sample data once --------
22
  @functools.lru_cache(maxsize=1)
23
  def load_sample_df():
@@ -57,9 +60,8 @@ def ensure_corpus_embeddings(model_name: str, texts: list[str]):
57
  return _CORPUS_CACHE[model_name]
58
  model_id = EMBEDDERS[model_name]
59
  model = load_sentence_model(model_id)
60
- emb = model.encode(
61
- texts, show_progress_bar=False, convert_to_numpy=True, normalize_embeddings=True
62
- )
63
  _CORPUS_CACHE[model_name] = emb
64
  return emb
65
 
@@ -71,7 +73,8 @@ def top3_for_each_model(user_input: str, selected_models: list[str]):
71
  try:
72
  model = load_sentence_model(EMBEDDERS[name])
73
  corpus_emb = ensure_corpus_embeddings(name, texts)
74
- q = model.encode([clean_text(user_input)], show_progress_bar=False, normalize_embeddings=True)
 
75
  sims = cosine_similarity(q, corpus_emb)[0]
76
  top_idx = sims.argsort()[-3:][::-1]
77
  for rank, i in enumerate(top_idx, start=1):
@@ -84,7 +87,7 @@ def top3_for_each_model(user_input: str, selected_models: list[str]):
84
  })
85
  except Exception as e:
86
  rows.append({
87
- "Model": name, "Rank": "-", "Similarity": "-",
88
  "Tweet (clean)": f"[Error: {e}]", "Tweet (orig)": ""
89
  })
90
  out = pd.DataFrame(rows, columns=["Model","Rank","Similarity","Tweet (clean)","Tweet (orig)"])
@@ -104,8 +107,8 @@ def generate_and_pick_best(prompt: str, n_sequences: int, max_length: int, tempe
104
 
105
  scorer_id = EMBEDDERS[scorer_model_name]
106
  scorer = load_sentence_model(scorer_id)
107
- q = scorer.encode([prompt], show_progress_bar=False, normalize_embeddings=True)
108
- cand_vecs = scorer.encode(candidates, show_progress_bar=False, normalize_embeddings=True)
109
  sims = cosine_similarity(q, cand_vecs)[0]
110
  best_idx = int(sims.argmax())
111
  table = pd.DataFrame({
@@ -135,7 +138,7 @@ Type a tweet, get similar tweets from Sentiment140, and generate a new one.
135
  )
136
 
137
  run_btn = gr.Button("🔎 Find Top‑3 Similar Tweets")
138
- table_out = gr.Dataframe(interactive=False) # simple & compatible
139
 
140
  run_btn.click(top3_for_each_model, inputs=[test_input, models], outputs=table_out)
141
 
 
8
  RANDOM_STATE = 42
9
  DEFAULT_INPUT = "I am so happy with this product"
10
 
11
+ # -------- Helpers --------
12
  def clean_text(text: str) -> str:
13
  text = text.lower()
14
  text = re.sub(r"http\S+", "", text)
 
18
  text = re.sub(r"\s+", " ", text).strip()
19
  return text
20
 
21
+ def _l2norm(x: np.ndarray) -> np.ndarray:
22
+ return x / (np.linalg.norm(x, axis=1, keepdims=True) + 1e-12)
23
+
24
  # -------- Load sample data once --------
25
  @functools.lru_cache(maxsize=1)
26
  def load_sample_df():
 
60
  return _CORPUS_CACHE[model_name]
61
  model_id = EMBEDDERS[model_name]
62
  model = load_sentence_model(model_id)
63
+ emb = model.encode(texts, show_progress_bar=False, convert_to_numpy=True)
64
+ emb = _l2norm(emb)
 
65
  _CORPUS_CACHE[model_name] = emb
66
  return emb
67
 
 
73
  try:
74
  model = load_sentence_model(EMBEDDERS[name])
75
  corpus_emb = ensure_corpus_embeddings(name, texts)
76
+ q = model.encode([clean_text(user_input)], show_progress_bar=False, convert_to_numpy=True)
77
+ q = _l2norm(q)
78
  sims = cosine_similarity(q, corpus_emb)[0]
79
  top_idx = sims.argsort()[-3:][::-1]
80
  for rank, i in enumerate(top_idx, start=1):
 
87
  })
88
  except Exception as e:
89
  rows.append({
90
+ "Model": name, "Rank": "-", "Similarity": "-",
91
  "Tweet (clean)": f"[Error: {e}]", "Tweet (orig)": ""
92
  })
93
  out = pd.DataFrame(rows, columns=["Model","Rank","Similarity","Tweet (clean)","Tweet (orig)"])
 
107
 
108
  scorer_id = EMBEDDERS[scorer_model_name]
109
  scorer = load_sentence_model(scorer_id)
110
+ q = scorer.encode([prompt], show_progress_bar=False, convert_to_numpy=True); q = _l2norm(q)
111
+ cand_vecs = scorer.encode(candidates, show_progress_bar=False, convert_to_numpy=True); cand_vecs = _l2norm(cand_vecs)
112
  sims = cosine_similarity(q, cand_vecs)[0]
113
  best_idx = int(sims.argmax())
114
  table = pd.DataFrame({
 
138
  )
139
 
140
  run_btn = gr.Button("🔎 Find Top‑3 Similar Tweets")
141
+ table_out = gr.Dataframe(interactive=False)
142
 
143
  run_btn.click(top3_for_each_model, inputs=[test_input, models], outputs=table_out)
144