# Sentiment140 notebook: data cleaning, sentence-embedding comparison, and GPT-2 tweet generation.
# (Header residue from a Hugging Face Spaces export removed.)
# Install datasets library | |
!pip install -U datasets | |
# Load Sentiment140 dataset | |
from datasets import load_dataset | |
dataset = load_dataset("sentiment140") | |
# Convert to pandas | |
import pandas as pd | |
df = dataset["train"].to_pandas() | |
df.head() | |
# Drop null values in text and sentiment | |
df.dropna(subset=["text", "sentiment"], inplace=True) | |
# Filter tweets with reasonable length | |
df["text_length"] = df["text"].apply(len) | |
df = df[(df["text_length"] >= 5) & (df["text_length"] <= 280)] | |
# Clean the text | |
import re | |
def clean_text(text):
    """Normalize a raw tweet for embedding.

    Lower-cases the text, then strips URLs, @-mentions, #-hashtags and
    punctuation, and finally collapses runs of whitespace into single
    spaces and trims the ends.
    """
    cleaned = text.lower()
    # Each (pattern, replacement) pair is applied in order; order matters:
    # punctuation removal must come after URL/mention/hashtag removal.
    substitutions = (
        (r"http\S+", ""),   # URLs
        (r"@\w+", ""),      # @mentions
        (r"#\w+", ""),      # #hashtags
        (r"[^\w\s]", ""),   # punctuation / symbols
        (r"\s+", " "),      # collapse whitespace
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
# Run the cleaner over every tweet and eyeball a before/after sample
df["clean_text"] = df["text"].map(clean_text)
df.loc[:, ["text", "clean_text"]].head()
# Next step: turn the numeric sentiment codes into text labels
def map_sentiment(label):
    """Translate a Sentiment140 numeric code into a text label.

    0 -> "negative", 2 -> "neutral", any other value -> "positive".
    """
    if label == 0:
        return "negative"
    if label == 2:
        return "neutral"
    return "positive"
df["sentiment_label"] = df["sentiment"].apply(map_sentiment) | |
df["sentiment_label"].value_counts() | |
# Save for future use | |
df[["clean_text", "sentiment_label"]].to_csv("cleaned_sentiment140.csv", index=False) | |
print("Cleaned data saved!") | |
!pip install -U sentence-transformers | |
from sentence_transformers import SentenceTransformer | |
import numpy as np | |
from sklearn.metrics.pairwise import cosine_similarity | |
# Use a small sample for speed (feel free to increase) | |
sample_df = df.sample(5000, random_state=42).reset_index(drop=True) | |
texts = sample_df["clean_text"].tolist() | |
# Load 3 different embedding models | |
models = { | |
"MiniLM": SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2"), | |
"MPNet": SentenceTransformer("sentence-transformers/all-mpnet-base-v2"), | |
"DistilRoBERTa": SentenceTransformer("sentence-transformers/paraphrase-distilroberta-base-v1") | |
} | |
# Compute and compare similarity for one test input | |
test_input = "I am so happy with this product" | |
def get_top3_similarities(model, texts, test_input, top_k=3):
    """Return the ``top_k`` texts most cosine-similar to ``test_input``.

    Parameters
    ----------
    model : object with an ``encode(list_of_str, ...)`` method returning
        a 2-D array of embeddings (e.g. a SentenceTransformer).
    texts : list[str] — candidate texts to rank.
    test_input : str — query sentence.
    top_k : int — number of results to return (default 3, matching the
        original behaviour; generalized so callers can ask for more/fewer).

    Returns
    -------
    list of (index, text, similarity) tuples, sorted by descending
    cosine similarity.
    """
    text_embeddings = np.asarray(model.encode(texts, show_progress_bar=True))
    input_embedding = np.asarray(model.encode([test_input]))[0]
    # Cosine similarity computed directly with numpy (no per-call sklearn
    # dependency); zero-norm vectors get similarity 0 instead of NaN.
    norms = np.linalg.norm(text_embeddings, axis=1) * np.linalg.norm(input_embedding)
    safe_norms = np.where(norms == 0, 1.0, norms)
    similarities = (text_embeddings @ input_embedding) / safe_norms
    # Descending argsort, truncated to top_k (equivalent to the previous
    # argsort()[-3:][::-1] when top_k == 3).
    top_indices = similarities.argsort()[::-1][:top_k]
    return [(i, texts[i], similarities[i]) for i in top_indices]
# Run the same query through every model and show each one's top 3 matches
results = {}
for model_name, embedder in models.items():
    print(f"\nπ Top 3 results from: {model_name}")
    top_matches = get_top3_similarities(embedder, texts, test_input)
    for rank, (idx, matched_text, score) in enumerate(top_matches, start=1):
        print(f"{rank}. [{score:.4f}] {matched_text}")
    results[model_name] = top_matches
!pip install -U transformers | |
from transformers import pipeline, set_seed | |
# Load small GPT-2 model for text generation | |
generator = pipeline("text-generation", model="distilgpt2") | |
set_seed(42) # reproducible results | |
# Example user input | |
test_input = "I'm feeling amazing about our product launch!" | |
# Generate synthetic tweets | |
synthetic_outputs = generator( | |
test_input, | |
max_length=50, | |
num_return_sequences=10, | |
do_sample=True, | |
temperature=0.9 | |
) | |
# Extract just the generated text | |
generated_tweets = [output["generated_text"].strip() for output in synthetic_outputs] | |
for i, tweet in enumerate(generated_tweets, 1): | |
print(f"{i}. {tweet}\n") | |
from sentence_transformers import SentenceTransformer

# Re-load the MiniLM encoder to score the generated tweets
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Embed the prompt and every generated tweet
input_vec = embedding_model.encode([test_input])
gen_vecs = embedding_model.encode(generated_tweets)

# Rank generated tweets by cosine similarity to the prompt and keep the best
from sklearn.metrics.pairwise import cosine_similarity

similarities = cosine_similarity(input_vec, gen_vecs)[0]
top_index = similarities.argmax()
best_generated = generated_tweets[top_index]
print(f"β Best AI-generated tweet:\n[{similarities[top_index]:.4f}] {best_generated}")