Salimtoama15 committed
Commit ae8ae88 · verified · 1 Parent(s): 3549594

Update app.py

Files changed (1)
  1. app.py +42 -92
app.py CHANGED
@@ -1,23 +1,25 @@
-# Install datasets library
-!pip install -U datasets
-
-# Load Sentiment140 dataset
-from datasets import load_dataset
-dataset = load_dataset("sentiment140")
-
-# Convert to pandas
+import gradio as gr
 import pandas as pd
-df = dataset["train"].to_pandas()
-df.head()
-# Drop null values in text and sentiment
-df.dropna(subset=["text", "sentiment"], inplace=True)
-
-# Filter tweets with reasonable length
-df["text_length"] = df["text"].apply(len)
-df = df[(df["text_length"] >= 5) & (df["text_length"] <= 280)]
-
-# Clean the text
-import re
+import re
+from datasets import load_dataset
+from sentence_transformers import SentenceTransformer
+from sklearn.metrics.pairwise import cosine_similarity
+from transformers import pipeline, set_seed
+import numpy as np
+
+# -------------------------------
+# 1. Load and clean dataset
+# -------------------------------
+@st.cache_resource
+def load_and_prepare_data():
+    dataset = load_dataset("sentiment140")
+    df = dataset["train"].to_pandas()
+    df.dropna(subset=["text", "sentiment"], inplace=True)
+    df["text_length"] = df["text"].apply(len)
+    df = df[(df["text_length"] >= 5) & (df["text_length"] <= 280)]
+    df["clean_text"] = df["text"].apply(clean_text)
+    return df
+
 def clean_text(text):
     text = text.lower()
     text = re.sub(r"http\S+", "", text)
@@ -27,83 +29,31 @@ def clean_text(text):
     text = re.sub(r"\s+", " ", text).strip()
     return text
 
-df["clean_text"] = df["text"].apply(clean_text)
-df[["text", "clean_text"]].head()
-# Convert sentiment labels from numbers to text
-def map_sentiment(label):
-    return "negative" if label == 0 else "neutral" if label == 2 else "positive"
-
-df["sentiment_label"] = df["sentiment"].apply(map_sentiment)
-df["sentiment_label"].value_counts()
-# Save for future use
-df[["clean_text", "sentiment_label"]].to_csv("cleaned_sentiment140.csv", index=False)
-print("Cleaned data saved!")
-!pip install -U sentence-transformers
-from sentence_transformers import SentenceTransformer
-import numpy as np
-from sklearn.metrics.pairwise import cosine_similarity
-
-# Use a small sample for speed (feel free to increase)
+# Load data once
+df = load_and_prepare_data()
+
 sample_df = df.sample(5000, random_state=42).reset_index(drop=True)
 texts = sample_df["clean_text"].tolist()
 
-# Load 3 different embedding models
-models = {
-    "MiniLM": SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2"),
-    "MPNet": SentenceTransformer("sentence-transformers/all-mpnet-base-v2"),
-    "DistilRoBERTa": SentenceTransformer("sentence-transformers/paraphrase-distilroberta-base-v1")
-}
-# Compute and compare similarity for one test input
-test_input = "I am so happy with this product"
-
-def get_top3_similarities(model, texts, test_input):
-    text_embeddings = model.encode(texts, show_progress_bar=True)
-    input_embedding = model.encode([test_input])
+# -------------------------------
+# 2. Load models
+# -------------------------------
+embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+generator = pipeline("text-generation", model="distilgpt2")
+set_seed(42)
+
+# -------------------------------
+# 3. Helper functions
+# -------------------------------
+def get_top3_similarities(text_input):
+    text_embeddings = embedding_model.encode(texts, show_progress_bar=False)
+    input_embedding = embedding_model.encode([text_input])
     similarities = cosine_similarity(input_embedding, text_embeddings)[0]
     top_indices = similarities.argsort()[-3:][::-1]
-    return [(i, texts[i], similarities[i]) for i in top_indices]
-
-# Try each model
-results = {}
-for name, model in models.items():
-    print(f"\n🔎 Top 3 results from: {name}")
-    top3 = get_top3_similarities(model, texts, test_input)
-    for rank, (idx, text, score) in enumerate(top3, start=1):
-        print(f"{rank}. [{score:.4f}] {text}")
-    results[name] = top3
-!pip install -U transformers
-from transformers import pipeline, set_seed
-
-# Load small GPT-2 model for text generation
-generator = pipeline("text-generation", model="distilgpt2")
-set_seed(42)  # reproducible results
-# Example user input
-test_input = "I'm feeling amazing about our product launch!"
-# Generate synthetic tweets
-synthetic_outputs = generator(
-    test_input,
-    max_length=50,
-    num_return_sequences=10,
-    do_sample=True,
-    temperature=0.9
-)
-
-# Extract just the generated text
-generated_tweets = [output["generated_text"].strip() for output in synthetic_outputs]
-for i, tweet in enumerate(generated_tweets, 1):
-    print(f"{i}. {tweet}\n")
-from sentence_transformers import SentenceTransformer
-
-# Load your best model again (MiniLM is a good choice)
-embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
-# Embed input and generated tweets
-input_vec = embedding_model.encode([test_input])
-gen_vecs = embedding_model.encode(generated_tweets)
-
-# Compute similarity and select best
-from sklearn.metrics.pairwise import cosine_similarity
-similarities = cosine_similarity(input_vec, gen_vecs)[0]
-top_index = similarities.argmax()
-best_generated = generated_tweets[top_index]
-
-print(f"✅ Best AI-generated tweet:\n[{similarities[top_index]:.4f}] {best_generated}")
+    return [texts[i] for i in top_indices]
+
+def generate_best_tweet(text_input):
+    synthetic_outputs = generator(
+        text_input,
+        max_length=50,
+        num_return_sequences=10,
+        do_sample=True,
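
Two loose ends in the new code are worth flagging. First, load_and_prepare_data is decorated with @st.cache_resource, a Streamlit API, while the app imports Gradio; the module would fail with a NameError at import time, and since the loader is called exactly once at module level, simply dropping the decorator (or swapping in functools.lru_cache) would fix it. Second, the diff view is cut off inside generate_best_tweet. A minimal sketch of how the file could be completed, assuming the function keeps the sampling settings and best-candidate selection from the removed notebook code; the analyze function and the gr.Interface wiring are illustrative assumptions, not part of the commit:

# Possible completion of generate_best_tweet, mirroring the removed
# notebook logic: sample 10 candidates, return the one most similar
# to the input. Assumes generator, embedding_model, cosine_similarity,
# and get_top3_similarities are defined as in the diff above.
def generate_best_tweet(text_input):
    synthetic_outputs = generator(
        text_input,
        max_length=50,
        num_return_sequences=10,
        do_sample=True,
        temperature=0.9,
    )
    candidates = [out["generated_text"].strip() for out in synthetic_outputs]
    input_vec = embedding_model.encode([text_input])
    cand_vecs = embedding_model.encode(candidates)
    similarities = cosine_similarity(input_vec, cand_vecs)[0]
    return candidates[similarities.argmax()]

# Hypothetical Gradio wiring; the truncated diff never shows how the
# helpers are exposed, so the names and layout here are guesses.
def analyze(text_input):
    top3 = "\n".join(get_top3_similarities(text_input))
    best = generate_best_tweet(text_input)
    return top3, best

demo = gr.Interface(
    fn=analyze,
    inputs=gr.Textbox(label="Your tweet"),
    outputs=[gr.Textbox(label="Top 3 similar tweets"),
             gr.Textbox(label="Best generated tweet")],
)

if __name__ == "__main__":
    demo.launch()

Note also that get_top3_similarities re-encodes all 5000 sampled tweets on every call; precomputing text_embeddings once at startup would be the single biggest latency win for the app.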