# Install datasets library
!pip install -U datasets

# Load Sentiment140 dataset
from datasets import load_dataset
dataset = load_dataset("sentiment140")

# Convert to pandas
import pandas as pd
df = dataset["train"].to_pandas()
df.head()
# Drop null values in text and sentiment
df.dropna(subset=["text", "sentiment"], inplace=True)

# Filter tweets with reasonable length
df["text_length"] = df["text"].apply(len)
df = df[(df["text_length"] >= 5) & (df["text_length"] <= 280)]

# Clean the text
import re
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"#\w+", "", text)
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

df["clean_text"] = df["text"].apply(clean_text)
df[["text", "clean_text"]].head()
# Convert sentiment labels from numbers to text
def map_sentiment(label):
    return "negative" if label == 0 else "neutral" if label == 2 else "positive"

df["sentiment_label"] = df["sentiment"].apply(map_sentiment)
df["sentiment_label"].value_counts()
# Save for future use
df[["clean_text", "sentiment_label"]].to_csv("cleaned_sentiment140.csv", index=False)
print("Cleaned data saved!")
!pip install -U sentence-transformers
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Use a small sample for speed (feel free to increase)
sample_df = df.sample(5000, random_state=42).reset_index(drop=True)
texts = sample_df["clean_text"].tolist()

# Load 3 different embedding models
models = {
    "MiniLM": SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2"),
    "MPNet": SentenceTransformer("sentence-transformers/all-mpnet-base-v2"),
    "DistilRoBERTa": SentenceTransformer("sentence-transformers/paraphrase-distilroberta-base-v1")
}
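
# Side note (not in the original code): print each model's embedding size to make the
# comparison concrete (MiniLM should report 384 dimensions, the other two 768).
for name, model in models.items():
    print(f"{name}: {model.get_sentence_embedding_dimension()} dimensions")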
# Compute and compare similarity for one test input
test_input = "I am so happy with this product"

def get_top3_similarities(model, texts, test_input):
    text_embeddings = model.encode(texts, show_progress_bar=True)
    input_embedding = model.encode([test_input])
    similarities = cosine_similarity(input_embedding, text_embeddings)[0]
    top_indices = similarities.argsort()[-3:][::-1]
    return [(i, texts[i], similarities[i]) for i in top_indices]

# Try each model
results = {}
for name, model in models.items():
    print(f"\nπŸ”Ž Top 3 results from: {name}")
    top3 = get_top3_similarities(model, texts, test_input)
    for rank, (idx, text, score) in enumerate(top3, start=1):
        print(f"{rank}. [{score:.4f}] {text}")
    results[name] = top3
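
# Optional follow-up (not in the original notebook): measure how much the three models
# agree by intersecting the indices of their top-3 matches.
top_index_sets = {name: {idx for idx, _, _ in top3} for name, top3 in results.items()}
model_names = list(top_index_sets)
for i in range(len(model_names)):
    for j in range(i + 1, len(model_names)):
        shared = top_index_sets[model_names[i]] & top_index_sets[model_names[j]]
        print(f"{model_names[i]} vs {model_names[j]}: {len(shared)} shared tweet(s)")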
!pip install -U transformers
from transformers import pipeline, set_seed

# Load small GPT-2 model for text generation
generator = pipeline("text-generation", model="distilgpt2")
set_seed(42)  # reproducible results
# Example user input
test_input = "I'm feeling amazing about our product launch!"
# Generate synthetic tweets (sampling is enabled so the 10 sequences differ)
synthetic_outputs = generator(
    test_input,
    max_length=50,           # total token budget, including the prompt
    num_return_sequences=10,
    do_sample=True,
    temperature=0.9          # higher temperature = more varied, less predictable text
)

# Collect the generated text (each sequence still includes the original prompt)
generated_tweets = [output["generated_text"].strip() for output in synthetic_outputs]
for i, tweet in enumerate(generated_tweets, 1):
    print(f"{i}. {tweet}\n")
from sentence_transformers import SentenceTransformer

# Load your best model again (MiniLM is a good choice)
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# Embed input and generated tweets
input_vec = embedding_model.encode([test_input])
gen_vecs = embedding_model.encode(generated_tweets)

# Compute similarity and select best
from sklearn.metrics.pairwise import cosine_similarity
similarities = cosine_similarity(input_vec, gen_vecs)[0]
top_index = similarities.argmax()
best_generated = generated_tweets[top_index]

print(f"βœ… Best AI-generated tweet:\n[{similarities[top_index]:.4f}] {best_generated}")