Salimtoama15 committed on
Commit
91cd368
·
verified ·
1 Parent(s): dcb0b33

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +106 -5
app.py CHANGED
@@ -1,8 +1,109 @@
1
- import gradio as gr
 
2
 
3
- def greet(name):
4
- return f"Hello {name}!"
 
5
 
6
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
 
 
 
 
 
7
 
8
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Dependency note: install the `datasets` library before running, e.g.
#   pip install -U datasets
# The original `!pip install -U datasets` line is IPython/notebook-only
# syntax and is a SyntaxError in a plain app.py — list the dependency in
# requirements.txt instead.

# Load the Sentiment140 dataset of labeled tweets.
from datasets import load_dataset

dataset = load_dataset("sentiment140")

# Convert the training split to a pandas DataFrame for easier wrangling.
import pandas as pd

df = dataset["train"].to_pandas()
df.head()

# Drop rows missing the tweet text or its sentiment label.
df.dropna(subset=["text", "sentiment"], inplace=True)

# Keep only tweets of a reasonable length (5-280 characters).
# Vectorized str.len() instead of apply(len): same result, one C-level pass.
df["text_length"] = df["text"].str.len()
df = df[(df["text_length"] >= 5) & (df["text_length"] <= 280)]
18
+
19
# Clean the text
import re


def clean_text(text):
    """Normalize a raw tweet for embedding.

    Lowercases, strips URLs, @mentions, #hashtags and punctuation,
    then collapses runs of whitespace to single spaces.
    """
    cleaned = text.lower()
    # Ordered removal passes; each pattern is deleted outright.
    for pattern in (r"http\S+", r"@\w+", r"#\w+", r"[^\w\s]"):
        cleaned = re.sub(pattern, "", cleaned)
    # Collapse whitespace and trim the ends.
    return re.sub(r"\s+", " ", cleaned).strip()
29
+
30
# Add a cleaned-text column; Series.map is elementwise like apply here.
df["clean_text"] = df["text"].map(clean_text)
df[["text", "clean_text"]].head()
32
# Convert sentiment labels from numbers to text
def map_sentiment(label):
    """Map Sentiment140's numeric codes to readable labels.

    0 -> "negative", 2 -> "neutral", anything else -> "positive".
    """
    if label == 0:
        return "negative"
    if label == 2:
        return "neutral"
    return "positive"
35
+
36
+ df["sentiment_label"] = df["sentiment"].apply(map_sentiment)
37
+ df["sentiment_label"].value_counts()
38
+ # Save for future use
39
+ df[["clean_text", "sentiment_label"]].to_csv("cleaned_sentiment140.csv", index=False)
40
+ print("Cleaned data saved!")
41
# Dependency note: `pip install -U sentence-transformers` before running.
# The original `!pip install` line is notebook-only syntax and breaks a
# plain .py file — move the dependency to requirements.txt.
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Use a small sample for speed (feel free to increase)
sample_df = df.sample(5000, random_state=42).reset_index(drop=True)
texts = sample_df["clean_text"].tolist()

# Load 3 different embedding models to compare retrieval quality.
models = {
    "MiniLM": SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2"),
    "MPNet": SentenceTransformer("sentence-transformers/all-mpnet-base-v2"),
    "DistilRoBERTa": SentenceTransformer("sentence-transformers/paraphrase-distilroberta-base-v1"),
}
56
# Compute and compare similarity for one test input
test_input = "I am so happy with this product"


def get_top3_similarities(model, texts, test_input):
    """Return the 3 corpus entries most similar to `test_input`.

    Each result is an (index, text, cosine_score) tuple, best first.
    """
    corpus_vecs = model.encode(texts, show_progress_bar=True)
    query_vec = model.encode([test_input])
    scores = cosine_similarity(query_vec, corpus_vecs)[0]
    # Full descending argsort, then take the first three — identical to
    # taking the last three ascending and reversing.
    top_indices = scores.argsort()[::-1][:3]
    return [(i, texts[i], scores[i]) for i in top_indices]
65

# Run the same query through every embedding model and show its top-3 hits.
results = {}
for model_name, embedder in models.items():
    print(f"\n🔎 Top 3 results from: {model_name}")
    top_matches = get_top3_similarities(embedder, texts, test_input)
    for position, (match_idx, match_text, match_score) in enumerate(top_matches, start=1):
        print(f"{position}. [{match_score:.4f}] {match_text}")
    results[model_name] = top_matches
74
# Dependency note: `pip install -U transformers` before running.
# The original `!pip install` line is notebook-only syntax and is a
# SyntaxError in a plain .py file — move it to requirements.txt.
from transformers import pipeline, set_seed

# Load small GPT-2 model for text generation
generator = pipeline("text-generation", model="distilgpt2")
set_seed(42)  # reproducible sampling results

# Example user input
# NOTE(review): this rebinds the `test_input` used for the embedding
# comparison above — intentional here, but easy to trip over later.
test_input = "I'm feeling amazing about our product launch!"

# Generate 10 candidate synthetic tweets seeded from the input.
synthetic_outputs = generator(
    test_input,
    # max_length counts the prompt tokens too; max_new_tokens would cap
    # only the generated part — kept as-is to preserve behavior.
    max_length=50,
    num_return_sequences=10,
    do_sample=True,
    temperature=0.9,
)

# Extract just the generated text
generated_tweets = [output["generated_text"].strip() for output in synthetic_outputs]
for i, tweet in enumerate(generated_tweets, 1):
    print(f"{i}. {tweet}\n")
95
from sentence_transformers import SentenceTransformer

# Load your best model again (MiniLM is a good choice)
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Embed the prompt and every generated tweet.
input_vec = embedding_model.encode([test_input])
gen_vecs = embedding_model.encode(generated_tweets)

# Pick the generated tweet closest to the prompt by cosine similarity.
from sklearn.metrics.pairwise import cosine_similarity

similarities = cosine_similarity(input_vec, gen_vecs)[0]
best_idx = similarities.argmax()
best_generated = generated_tweets[best_idx]

print(f"✅ Best AI-generated tweet:\n[{similarities[best_idx]:.4f}] {best_generated}")