Salimtoama15 committed on
Commit
61a3f6e
·
verified ·
1 Parent(s): 61b06dc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -26
app.py CHANGED
@@ -3,7 +3,7 @@ import gradio as gr
3
  from datasets import load_dataset
4
  from sklearn.metrics.pairwise import cosine_similarity
5
 
6
- # -------- Config (safe defaults for CPU Spaces) --------
7
  SAMPLE_SIZE = int(os.getenv("SAMPLE_SIZE", "5000"))
8
  RANDOM_STATE = 42
9
  DEFAULT_INPUT = "I am so happy with this product"
@@ -18,7 +18,19 @@ def clean_text(text: str) -> str:
18
  text = re.sub(r"\s+", " ", text).strip()
19
  return text
20
 
 
 
 
 
 
 
 
 
 
21
  def _l2norm(x: np.ndarray) -> np.ndarray:
 
 
 
22
  return x / (np.linalg.norm(x, axis=1, keepdims=True) + 1e-12)
23
 
24
  # -------- Load sample data once --------
@@ -45,27 +57,31 @@ def load_generator():
45
  set_seed(RANDOM_STATE)
46
  return pipeline("text-generation", model="distilgpt2")
47
 
48
- # Map names → HF ids
49
  EMBEDDERS = {
50
  "MiniLM (fast)": "sentence-transformers/all-MiniLM-L6-v2",
51
  "MPNet (heavier)": "sentence-transformers/all-mpnet-base-v2",
52
  "DistilRoBERTa (paraphrase)": "sentence-transformers/paraphrase-distilroberta-base-v1",
53
  }
54
 
55
- # Cache for precomputed corpus embeddings per model
56
  _CORPUS_CACHE = {}
57
 
58
- def ensure_corpus_embeddings(model_name: str, texts: list[str]):
 
 
 
 
 
 
59
  if model_name in _CORPUS_CACHE:
60
  return _CORPUS_CACHE[model_name]
61
- model_id = EMBEDDERS[model_name]
62
- model = load_sentence_model(model_id)
63
- emb = model.encode(texts, show_progress_bar=False, convert_to_numpy=True)
64
- emb = _l2norm(emb)
65
  _CORPUS_CACHE[model_name] = emb
66
  return emb
67
 
68
- def top3_for_each_model(user_input: str, selected_models: list[str]):
69
  df = load_sample_df()
70
  texts = df["clean_text"].tolist()
71
  rows = []
@@ -73,8 +89,7 @@ def top3_for_each_model(user_input: str, selected_models: list[str]):
73
  try:
74
  model = load_sentence_model(EMBEDDERS[name])
75
  corpus_emb = ensure_corpus_embeddings(name, texts)
76
- q = model.encode([clean_text(user_input)], show_progress_bar=False, convert_to_numpy=True)
77
- q = _l2norm(q)
78
  sims = cosine_similarity(q, corpus_emb)[0]
79
  top_idx = sims.argsort()[-3:][::-1]
80
  for rank, i in enumerate(top_idx, start=1):
@@ -90,35 +105,40 @@ def top3_for_each_model(user_input: str, selected_models: list[str]):
90
  "Model": name, "Rank": "-", "Similarity": "-",
91
  "Tweet (clean)": f"[Error: {e}]", "Tweet (orig)": ""
92
  })
93
- out = pd.DataFrame(rows, columns=["Model","Rank","Similarity","Tweet (clean)","Tweet (orig)"])
94
- return out
95
 
96
- def generate_and_pick_best(prompt: str, n_sequences: int, max_length: int, temperature: float, scorer_model_name: str):
 
 
 
 
97
  gen = load_generator()
 
 
 
98
  outputs = gen(
99
  prompt,
100
- max_length=max_length,
101
- num_return_sequences=n_sequences,
102
  do_sample=True,
103
- temperature=temperature,
104
- pad_token_id=50256, # silence warning
105
  )
106
  candidates = [o["generated_text"].strip() for o in outputs]
107
 
108
- scorer_id = EMBEDDERS[scorer_model_name]
109
- scorer = load_sentence_model(scorer_id)
110
- q = scorer.encode([prompt], show_progress_bar=False, convert_to_numpy=True); q = _l2norm(q)
111
- cand_vecs = scorer.encode(candidates, show_progress_bar=False, convert_to_numpy=True); cand_vecs = _l2norm(cand_vecs)
112
  sims = cosine_similarity(q, cand_vecs)[0]
113
  best_idx = int(sims.argmax())
 
114
  table = pd.DataFrame({
115
  "Rank": np.argsort(-sims) + 1,
116
  "Similarity": np.sort(sims)[::-1],
117
  "Generated Tweet": [c for _, c in sorted(zip(-sims, candidates))]
118
  })
119
- best = candidates[best_idx]
120
- best_score = float(sims[best_idx])
121
- return best, best_score, table
122
 
123
  # ---------------- UI ----------------
124
  with gr.Blocks(title="Sentiment140 Embeddings + Generation") as demo:
@@ -147,7 +167,7 @@ Type a tweet, get similar tweets from Sentiment140, and generate a new one.
147
 
148
  with gr.Row():
149
  n_seq = gr.Slider(3, 15, value=8, step=1, label="Number of candidates")
150
- max_len = gr.Slider(30, 120, value=60, step=1, label="Max length")
151
  temp = gr.Slider(0.5, 1.5, value=0.9, step=0.05, label="Temperature")
152
  scorer_model = gr.Dropdown(list(EMBEDDERS.keys()), value="MiniLM (fast)", label="Scorer embedding")
153
 
 
3
  from datasets import load_dataset
4
  from sklearn.metrics.pairwise import cosine_similarity
5
 
6
+ # -------- Config --------
7
  SAMPLE_SIZE = int(os.getenv("SAMPLE_SIZE", "5000"))
8
  RANDOM_STATE = 42
9
  DEFAULT_INPUT = "I am so happy with this product"
 
18
  text = re.sub(r"\s+", " ", text).strip()
19
  return text
20
 
21
def _to_numpy(x):
    """Convert an encoder output to a NumPy array.

    Handles a torch tensor, a non-empty list/tuple of torch tensors, or
    anything ``np.asarray`` accepts (NumPy arrays, nested lists, ...).

    Torch is imported lazily; if it is not installed the input cannot be a
    tensor, so it is handed straight to ``np.asarray``. Only the missing
    import is tolerated - genuine conversion errors are allowed to surface
    instead of being silently swallowed.
    """
    try:
        import torch
    except ImportError:
        return np.asarray(x)

    if isinstance(x, torch.Tensor):
        # detach + cpu first so tensors that track gradients or live on an
        # accelerator can be converted.
        return x.detach().cpu().numpy()
    if isinstance(x, (list, tuple)) and x and all(isinstance(t, torch.Tensor) for t in x):
        # One tensor per input text: stack them into a single 2-D array.
        return torch.stack([t.detach().cpu() for t in x]).numpy()
    return np.asarray(x)
29
+
30
def _l2norm(x: np.ndarray) -> np.ndarray:
    """Return ``x`` as a float32 2-D array whose rows are scaled to unit L2 norm.

    Args:
        x: A 2-D array (one vector per row) or a single 1-D vector. Any
            array-like is accepted and converted with ``np.asarray``.

    Returns:
        A 2-D float32 array; a 1-D input comes back with shape ``(1, d)``.
    """
    x = np.asarray(x, dtype=np.float32)
    if x.ndim == 1:
        # Promote a single vector to one row so the row-wise norm below applies.
        x = x.reshape(1, -1)
    # The small epsilon keeps all-zero rows from dividing by zero.
    return x / (np.linalg.norm(x, axis=1, keepdims=True) + 1e-12)
35
 
36
  # -------- Load sample data once --------
 
57
  set_seed(RANDOM_STATE)
58
  return pipeline("text-generation", model="distilgpt2")
59
 
60
# Display label -> Hugging Face model id of each selectable sentence encoder.
EMBEDDERS = {
    "MiniLM (fast)": "sentence-transformers/all-MiniLM-L6-v2",
    "MPNet (heavier)": "sentence-transformers/all-mpnet-base-v2",
    "DistilRoBERTa (paraphrase)": "sentence-transformers/paraphrase-distilroberta-base-v1",
}

# Corpus embeddings computed so far, one entry per model.
_CORPUS_CACHE = {}
69
 
70
def _encode_norm(model, texts):
    """Encode ``texts`` with ``model`` and return L2-normalized NumPy embeddings.

    The raw ``model.encode`` output is passed through ``_to_numpy`` before
    normalization, so the caller always receives an (n, d) NumPy array
    whatever container type the encoder returned.
    """
    raw = model.encode(texts, show_progress_bar=False)
    return _l2norm(_to_numpy(raw))
75
+
76
def ensure_corpus_embeddings(model_name: str, texts: list):
    """Return the normalized corpus embeddings for ``model_name``, computing them once.

    The cache is keyed by ``model_name`` only: ``texts`` is encoded on the
    first call for a given model and ignored on later calls, which return the
    cached array.
    """
    if model_name not in _CORPUS_CACHE:
        encoder = load_sentence_model(EMBEDDERS[model_name])
        _CORPUS_CACHE[model_name] = _encode_norm(encoder, texts)
    return _CORPUS_CACHE[model_name]
83
 
84
+ def top3_for_each_model(user_input: str, selected_models: list):
85
  df = load_sample_df()
86
  texts = df["clean_text"].tolist()
87
  rows = []
 
89
  try:
90
  model = load_sentence_model(EMBEDDERS[name])
91
  corpus_emb = ensure_corpus_embeddings(name, texts)
92
+ q = _encode_norm(model, [clean_text(user_input)])
 
93
  sims = cosine_similarity(q, corpus_emb)[0]
94
  top_idx = sims.argsort()[-3:][::-1]
95
  for rank, i in enumerate(top_idx, start=1):
 
105
  "Model": name, "Rank": "-", "Similarity": "-",
106
  "Tweet (clean)": f"[Error: {e}]", "Tweet (orig)": ""
107
  })
108
+ return pd.DataFrame(rows, columns=["Model","Rank","Similarity","Tweet (clean)","Tweet (orig)"])
 
109
 
110
+ # -------- Generator + scoring (with progress) --------
111
def generate_and_pick_best(prompt: str, n_sequences: int, max_length: int,
                           temperature: float, scorer_model_name: str,
                           progress=gr.Progress()):
    """Generate candidate texts from ``prompt`` and rank them by similarity to it.

    Args:
        prompt: Seed text passed to the text-generation pipeline.
        n_sequences: Number of candidates to sample.
        max_length: Number of NEW tokens to generate per candidate.
        temperature: Sampling temperature.
        scorer_model_name: Key into ``EMBEDDERS`` selecting the embedding
            model used to score candidates against the prompt.
        progress: Gradio progress callback.

    Returns:
        A tuple ``(best_text, best_similarity, table)`` where ``table`` is a
        DataFrame with columns "Rank", "Similarity" and "Generated Tweet",
        sorted from most to least similar.
    """
    progress(0.0, desc="Loading models…")
    gen = load_generator()
    scorer = load_sentence_model(EMBEDDERS[scorer_model_name])

    progress(0.3, desc="Generating candidates…")
    outputs = gen(
        prompt,
        max_new_tokens=int(max_length),  # number of NEW tokens to generate
        num_return_sequences=int(n_sequences),
        do_sample=True,
        temperature=float(temperature),
        pad_token_id=50256,  # explicit pad id; presumably silences the pad-token warning
    )
    candidates = [o["generated_text"].strip() for o in outputs]

    progress(0.7, desc="Scoring candidates…")
    q = _encode_norm(scorer, [prompt])
    cand_vecs = _encode_norm(scorer, candidates)
    sims = cosine_similarity(q, cand_vecs)[0]

    # One stable descending ordering drives every column, so rank, score and
    # text always describe the same candidate (ties keep generation order).
    order = np.argsort(-sims, kind="stable")
    best_idx = int(order[0])

    table = pd.DataFrame({
        "Rank": np.arange(1, len(order) + 1),
        "Similarity": sims[order],
        "Generated Tweet": [candidates[i] for i in order],
    })
    progress(1.0)
    return candidates[best_idx], float(sims[best_idx]), table
 
142
 
143
  # ---------------- UI ----------------
144
  with gr.Blocks(title="Sentiment140 Embeddings + Generation") as demo:
 
167
 
168
  with gr.Row():
169
  n_seq = gr.Slider(3, 15, value=8, step=1, label="Number of candidates")
170
+ max_len = gr.Slider(30, 120, value=60, step=1, label="Max length (new tokens)")
171
  temp = gr.Slider(0.5, 1.5, value=0.9, step=0.05, label="Temperature")
172
  scorer_model = gr.Dropdown(list(EMBEDDERS.keys()), value="MiniLM (fast)", label="Scorer embedding")
173