Salimtoama15 committed on
Commit
ab3f10f
·
verified ·
1 Parent(s): 5d0393a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +145 -97
app.py CHANGED
@@ -1,24 +1,15 @@
1
- # Install datasets library
 
 
 
2
 
 
 
 
 
3
 
4
- # Load Sentiment140 dataset
5
- from datasets import load_dataset
6
- dataset = load_dataset("sentiment140")
7
-
8
- # Convert to pandas
9
- import pandas as pd
10
- df = dataset["train"].to_pandas()
11
- df.head()
12
- # Drop null values in text and sentiment
13
- df.dropna(subset=["text", "sentiment"], inplace=True)
14
-
15
- # Filter tweets with reasonable length
16
- df["text_length"] = df["text"].apply(len)
17
- df = df[(df["text_length"] >= 5) & (df["text_length"] <= 280)]
18
-
19
- # Clean the text
20
- import re
21
- def clean_text(text):
22
  text = text.lower()
23
  text = re.sub(r"http\S+", "", text)
24
  text = re.sub(r"@\w+", "", text)
@@ -27,83 +18,140 @@ def clean_text(text):
27
  text = re.sub(r"\s+", " ", text).strip()
28
  return text
29
 
30
- df["clean_text"] = df["text"].apply(clean_text)
31
- df[["text", "clean_text"]].head()
32
- # Convert sentiment labels from numbers to text
33
- def map_sentiment(label):
34
- return "negative" if label == 0 else "neutral" if label == 2 else "positive"
35
-
36
- df["sentiment_label"] = df["sentiment"].apply(map_sentiment)
37
- df["sentiment_label"].value_counts()
38
- # Save for future use
39
- df[["clean_text", "sentiment_label"]].to_csv("cleaned_sentiment140.csv", index=False)
40
- print("Cleaned data saved!")
41
- !pip install -U sentence-transformers
42
- from sentence_transformers import SentenceTransformer
43
- import numpy as np
44
- from sklearn.metrics.pairwise import cosine_similarity
45
-
46
- # Use a small sample for speed (feel free to increase)
47
- sample_df = df.sample(5000, random_state=42).reset_index(drop=True)
48
- texts = sample_df["clean_text"].tolist()
49
-
50
- # Load 3 different embedding models
51
- models = {
52
- "MiniLM": SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2"),
53
- "MPNet": SentenceTransformer("sentence-transformers/all-mpnet-base-v2"),
54
- "DistilRoBERTa": SentenceTransformer("sentence-transformers/paraphrase-distilroberta-base-v1")
 
 
 
 
55
  }
56
- # Compute and compare similarity for one test input
57
- test_input = "I am so happy with this product"
58
-
59
- def get_top3_similarities(model, texts, test_input):
60
- text_embeddings = model.encode(texts, show_progress_bar=True)
61
- input_embedding = model.encode([test_input])
62
- similarities = cosine_similarity(input_embedding, text_embeddings)[0]
63
- top_indices = similarities.argsort()[-3:][::-1]
64
- return [(i, texts[i], similarities[i]) for i in top_indices]
65
-
66
- # Try each model
67
- results = {}
68
- for name, model in models.items():
69
- print(f"\n🔎 Top 3 results from: {name}")
70
- top3 = get_top3_similarities(model, texts, test_input)
71
- for rank, (idx, text, score) in enumerate(top3, start=1):
72
- print(f"{rank}. [{score:.4f}] {text}")
73
- results[name] = top3
74
- !pip install -U transformers
75
- from transformers import pipeline, set_seed
76
-
77
- # Load small GPT-2 model for text generation
78
- generator = pipeline("text-generation", model="distilgpt2")
79
- set_seed(42) # reproducible results
80
- # Example user input
81
- test_input = "I'm feeling amazing about our product launch!"
82
- # Generate synthetic tweets
83
- synthetic_outputs = generator(
84
- test_input,
85
- max_length=50,
86
- num_return_sequences=10,
87
- do_sample=True,
88
- temperature=0.9
89
- )
90
-
91
- # Extract just the generated text
92
- generated_tweets = [output["generated_text"].strip() for output in synthetic_outputs]
93
- for i, tweet in enumerate(generated_tweets, 1):
94
- print(f"{i}. {tweet}\n")
95
- from sentence_transformers import SentenceTransformer
96
-
97
- # Load your best model again (MiniLM is a good choice)
98
- embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
99
- # Embed input and generated tweets
100
- input_vec = embedding_model.encode([test_input])
101
- gen_vecs = embedding_model.encode(generated_tweets)
102
-
103
- # Compute similarity and select best
104
- from sklearn.metrics.pairwise import cosine_similarity
105
- similarities = cosine_similarity(input_vec, gen_vecs)[0]
106
- top_index = similarities.argmax()
107
- best_generated = generated_tweets[top_index]
108
 
109
- print(f"✅ Best AI-generated tweet:\n[{similarities[top_index]:.4f}] {best_generated}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, re, functools, numpy as np, pandas as pd
2
+ import gradio as gr
3
+ from datasets import load_dataset
4
+ from sklearn.metrics.pairwise import cosine_similarity
5
 
6
+ # -------- Config (safe defaults for CPU Spaces) --------
7
+ SAMPLE_SIZE = int(os.getenv("SAMPLE_SIZE", "5000"))
8
+ RANDOM_STATE = 42
9
+ DEFAULT_INPUT = "I am so happy with this product"
10
 
11
+ # -------- Text cleaning (yours) --------
12
+ def clean_text(text: str) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  text = text.lower()
14
  text = re.sub(r"http\S+", "", text)
15
  text = re.sub(r"@\w+", "", text)
 
18
  text = re.sub(r"\s+", " ", text).strip()
19
  return text
20
 
21
+ # -------- Load sample data once --------
22
@functools.lru_cache(maxsize=1)
def load_sample_df():
    """Load Sentiment140 and return a cleaned random sample of tweets.

    The "train" split is converted to pandas, rows with a missing ``text`` or
    ``sentiment`` are dropped, and only tweets whose raw length is between
    5 and 280 characters (inclusive) are kept. At most ``SAMPLE_SIZE`` rows
    are then drawn with ``RANDOM_STATE`` as the seed.

    Returns:
        A DataFrame with a fresh 0..n-1 index and two columns: ``text`` (the
        original tweet) and ``clean_text`` (the output of ``clean_text``).
        The result is memoised by ``lru_cache``, so every caller receives the
        same DataFrame object.
    """
    ds = load_dataset("sentiment140", split="train")
    df = ds.to_pandas()
    df = df.dropna(subset=["text", "sentiment"])

    lengths = df["text"].str.len()
    df = df[(lengths >= 5) & (lengths <= 280)]

    # Sample BEFORE cleaning: the rows drawn depend only on the frame length
    # and the seed, so the selection is unchanged, but the regex clean-up now
    # runs on SAMPLE_SIZE rows instead of the whole filtered split.
    sample = df.sample(min(SAMPLE_SIZE, len(df)), random_state=RANDOM_STATE).reset_index(drop=True)
    sample["clean_text"] = sample["text"].apply(clean_text)
    return sample[["text", "clean_text"]]
32
+
33
+ # -------- Lazy model loaders --------
34
@functools.lru_cache(maxsize=None)
def load_sentence_model(model_id: str):
    """Return the SentenceTransformer for ``model_id``, building it on first use.

    The ``sentence_transformers`` import is deferred to call time, and
    ``lru_cache`` memoises one instance per distinct ``model_id``.
    """
    import sentence_transformers

    return sentence_transformers.SentenceTransformer(model_id)
38
+
39
@functools.lru_cache(maxsize=None)
def load_generator():
    """Return the cached DistilGPT-2 text-generation pipeline.

    On the first call the ``transformers`` import happens, the global seed is
    set to ``RANDOM_STATE``, and the pipeline is created; later calls reuse
    the memoised pipeline without re-seeding.
    """
    import transformers

    transformers.set_seed(RANDOM_STATE)
    text_generator = transformers.pipeline("text-generation", model="distilgpt2")
    return text_generator
44
+
45
+ # Map names → HF ids
46
+ EMBEDDERS = {
47
+ "MiniLM (fast)": "sentence-transformers/all-MiniLM-L6-v2",
48
+ "MPNet (heavier)": "sentence-transformers/all-mpnet-base-v2",
49
+ "DistilRoBERTa (paraphrase)": "sentence-transformers/paraphrase-distilroberta-base-v1",
50
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
+ # Cache for precomputed corpus embeddings per model
53
_CORPUS_CACHE = {}


def ensure_corpus_embeddings(model_name: str, texts: list[str]):
    """Return corpus embeddings for ``model_name``, encoding ``texts`` on a miss.

    The cache is keyed by ``model_name`` only: once an entry exists it is
    returned as-is and ``texts`` is not consulted.

    Args:
        model_name: A key of ``EMBEDDERS``.
        texts: Corpus to encode when no cached entry exists.

    Returns:
        The embeddings produced by ``model.encode`` (requested as a NumPy
        array with normalised vectors), or the previously cached value.
    """
    try:
        return _CORPUS_CACHE[model_name]
    except KeyError:
        pass  # cache miss: fall through and encode the corpus

    encoder = load_sentence_model(EMBEDDERS[model_name])
    # Progress bar disabled to keep logs clean on Spaces.
    vectors = encoder.encode(
        texts,
        show_progress_bar=False,
        convert_to_numpy=True,
        normalize_embeddings=True,
    )
    _CORPUS_CACHE[model_name] = vectors
    return vectors
65
+
66
def top3_for_each_model(user_input: str, selected_models: list[str]):
    """Find the three sampled tweets most similar to ``user_input`` per model.

    Args:
        user_input: Free text; it is passed through ``clean_text`` before encoding.
        selected_models: Keys of ``EMBEDDERS`` to evaluate, in order.

    Returns:
        A DataFrame with columns Model, Rank, Similarity, Tweet (clean) and
        Tweet (orig). A model that raises contributes a single placeholder
        row carrying the error message instead of aborting the whole call.
    """
    columns = ["Model", "Rank", "Similarity", "Tweet (clean)", "Tweet (orig)"]
    sample = load_sample_df()
    corpus = sample["clean_text"].tolist()

    records = []
    for model_name in selected_models:
        try:
            encoder = load_sentence_model(EMBEDDERS[model_name])
            corpus_vectors = ensure_corpus_embeddings(model_name, corpus)
            query_vector = encoder.encode(
                [clean_text(user_input)],
                show_progress_bar=False,
                normalize_embeddings=True,
            )
            scores = cosine_similarity(query_vector, corpus_vectors)[0]
            # argsort is ascending: take the last three and flip to best-first.
            best_three = scores.argsort()[-3:][::-1]
            for position, row in enumerate(best_three, start=1):
                records.append({
                    "Model": model_name,
                    "Rank": position,
                    "Similarity": float(scores[row]),
                    "Tweet (clean)": corpus[row],
                    "Tweet (orig)": sample.at[row, "text"],
                })
        except Exception as exc:
            records.append({
                "Model": model_name,
                "Rank": "-",
                "Similarity": "-",
                "Tweet (clean)": f"[Error: {exc}]",
                "Tweet (orig)": "",
            })

    return pd.DataFrame(records, columns=columns)
89
+
90
def _rank_candidates(candidates, sims):
    """Order ``candidates`` by descending similarity and build the result table."""
    sims = np.asarray(sims)
    # One stable ordering drives every column, so ranks, scores and tweets
    # always stay aligned (ties keep generation order).
    order = np.argsort(-sims, kind="stable")
    table = pd.DataFrame({
        "Rank": np.arange(1, len(order) + 1),
        "Similarity": sims[order],
        "Generated Tweet": [candidates[i] for i in order],
    })
    top = int(order[0])
    return candidates[top], float(sims[top]), table


def generate_and_pick_best(prompt: str, n_sequences: int, max_length: int, temperature: float, scorer_model_name: str):
    """Generate candidate tweets from ``prompt`` and pick the most similar one.

    Args:
        prompt: Seed text for generation and the reference for scoring.
        n_sequences: Number of candidates to sample (coerced to ``int``,
            since UI sliders may deliver floats).
        max_length: ``max_length`` passed to the generator (coerced to ``int``).
        temperature: Sampling temperature.
        scorer_model_name: Key of ``EMBEDDERS`` naming the embedding model
            used to score candidates against the prompt.

    Returns:
        ``(best, best_score, table)``: the highest-scoring candidate, its
        cosine similarity to the prompt, and a DataFrame with columns Rank
        (1..n), Similarity (descending) and Generated Tweet. A missing or
        blank prompt yields ``("", 0.0, <empty table>)`` without loading
        any model.
    """
    if not prompt or not prompt.strip():
        empty = pd.DataFrame(columns=["Rank", "Similarity", "Generated Tweet"])
        return "", 0.0, empty

    gen = load_generator()
    outputs = gen(
        prompt,
        max_length=int(max_length),
        num_return_sequences=int(n_sequences),
        do_sample=True,
        temperature=float(temperature),
    )
    candidates = [o["generated_text"].strip() for o in outputs]

    scorer = load_sentence_model(EMBEDDERS[scorer_model_name])
    q = scorer.encode([prompt], show_progress_bar=False, normalize_embeddings=True)
    cand_vecs = scorer.encode(candidates, show_progress_bar=False, normalize_embeddings=True)
    sims = cosine_similarity(q, cand_vecs)[0]

    return _rank_candidates(candidates, sims)
109
+
110
+ with gr.Blocks(title="Sentiment140 Embeddings + Generation") as demo:
111
+ gr.Markdown(
112
+ """
113
+ # 🧪 Sentiment140 — Embeddings & Tweet Generator
114
+ Small, reliable demo for your final project:
115
+ 1) Compare top-3 most similar tweets from **Sentiment140** across embedding models.
116
+ 2) Generate synthetic tweets with **DistilGPT‑2** and auto‑pick the best by semantic similarity.
117
+
118
+ > Tip: Start with **MiniLM (fast)** on CPU Spaces. Add MPNet/DistilRoBERTa if you have a GPU.
119
+ """
120
+ )
121
+
122
+ with gr.Row():
123
+ test_input = gr.Textbox(label="Your input", value=DEFAULT_INPUT, lines=2)
124
+ models = gr.CheckboxGroup(
125
+ choices=list(EMBEDDERS.keys()),
126
+ value=["MiniLM (fast)"],
127
+ label="Embedding models to compare"
128
+ )
129
+
130
+ run_btn = gr.Button("🔎 Find Top‑3 Similar Tweets")
131
+ table_out = gr.Dataframe(interactive=False, wrap=True)
132
+
133
+ run_btn.click(top3_for_each_model, inputs=[test_input, models], outputs=table_out)
134
+
135
+ gr.Markdown("---")
136
+ gr.Markdown("## 📝 Generate Tweets and Pick the Best (by similarity to your input)")
137
+
138
+ with gr.Row():
139
+ n_seq = gr.Slider(3, 15, value=8, step=1, label="Number of candidates")
140
+ max_len = gr.Slider(30, 120, value=60, step=1, label="Max length")
141
+ temp = gr.Slider(0.5, 1.5, value=0.9, step=0.05, label="Temperature")
142
+ scorer_model = gr.Dropdown(list(EMBEDDERS.keys()), value="MiniLM (fast)", label="Scorer embedding")
143
+
144
+ gen_btn = gr.Button("✨ Generate & Score")
145
+ best_txt = gr.Textbox(label="Best generated tweet")
146
+ best_score = gr.Number(label="Similarity (best)")
147
+ gen_table = gr.Dataframe(interactive=False, wrap=True)
148
+
149
+ gen_btn.click(generate_and_pick_best,
150
+ inputs=[test_input, n_seq, max_len, temp, scorer_model],
151
+ outputs=[best_txt, best_score, gen_table])
152
+
153
+ gr.Markdown("---")
154
+ gr.Markdown("## 🖼️ Project Photo (optional, just to display it in the app)")
155
+ photo = gr.Image(label="Upload your project photo (jpg/png)", type="filepath")
156
+
157
+ demo.queue(max_size=32).launch()