# NOTE(review): "Spaces: Sleeping" — page-status residue captured when this file was extracted; not part of the program.
import html
import json
import os
import re
from collections import defaultdict
from itertools import cycle

import numpy as np
import pandas as pd
import plotly.graph_objs as go
import streamlit as st
from dotenv import load_dotenv
from groq import Groq
# --- ENVIRONMENT ---
# Load the .env file before anything below reads the environment; otherwise
# values defined there (PERSONA_PATH, the cache directories, GROQ_API_KEY)
# would not be visible to the lookups that follow.
load_dotenv()

# Path of the JSON file the generated personas are written to.
PERSONA_PATH = os.getenv("PERSONA_PATH", "/tmp/personas.json")

# Set HuggingFace cache directories to /tmp for cloud hosting (permission safe).
# setdefault keeps any value that is already configured in the environment.
os.environ.setdefault("TRANSFORMERS_CACHE", "/tmp/hf_cache")
os.environ.setdefault("HF_HOME", "/tmp/huggingface")
os.environ.setdefault("HF_DATASETS_CACHE", "/tmp/huggingface")

# --- THEME COLORS ---
neon_blue = "#00fff7"
neon_green = "#7CFC00"
neon_pink = "#F72585"
neon_yellow = "#FFF600"
neon_bg = "#181830"
neon_orange = "#FFB347"
neon_dark = "#202037"

# --- CONFIG ---
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
GROQ_MODEL = "llama3-70b-8192"
groq_client = Groq(api_key=GROQ_API_KEY)

# System prompt shared by every chat-completion request in this file.
PRODUCT_CONTEXT = (
    "You are an AI market research expert analyzing customer reviews for a chocolate-flavoured whey protein powder. "
    "Generate user personas based on patterns and diversity in the reviews."
)

# Review dataset read by load_reviews().
CSV_PATH = "src/data_with_text.csv"
# --- PAGE SETUP ---
# set_page_config is issued before any other Streamlit call in this file.
st.set_page_config(page_title="Persona Lab", layout="wide", initial_sidebar_state="collapsed")

st.markdown(
    "<h1 style='color:#00fff7;font-size:2.6rem;font-weight:900;letter-spacing:0.01em;margin-bottom:5px;'>🎭 Persona Lab</h1>",
    unsafe_allow_html=True
)

# Set dark theme programmatically
st.markdown(
    """
<style>
body, .main, .stApp {
background: #14151A !important;
color: #fff !important;
}
</style>
""",
    unsafe_allow_html=True
)

# Intro blurb. A plain string: there is nothing to interpolate, so no f-prefix.
st.markdown(
    """
<div style="font-size:1.21rem; color:#AC7CFF; font-weight:600; margin-top:-13px; margin-bottom:14px; line-height:1.5;">
Ready to peek inside the minds of your customers?
This is your sandbox for uncovering who buys, why they rave, and what they crave—powered by real reviews and sharp AI.
Dive in, explore the personas that drive your market, and see your brand through their eyes (and taste buds)!
</div>
""",
    unsafe_allow_html=True
)

# --- NAVIGATION BUTTONS ---
st.markdown("""
<style>
.neon-btn {
display:inline-block;
font-weight:bold;
padding:14px 32px;
border:none;
border-radius:12px;
font-size:1.1em;
margin-right:18px;
cursor:pointer;
box-shadow:0 0 14px #00fff777;
color:#222 !important;
background:linear-gradient(90deg,#7CFC00,#00fff7);
text-decoration:none !important;
transition: transform 0.08s;
}
.neon-btn-pink {
background:linear-gradient(90deg,#F72585,#00fff7);
color:#fff !important;
box-shadow:0 0 14px #F7258577;
}
.neon-btn:hover {
transform:scale(1.04);
box-shadow:0 0 24px #00fff799;
}
.neon-btn-pink:hover {
box-shadow:0 0 24px #F7258599;
}
</style>
""", unsafe_allow_html=True)

# HTML attributes must be separated by whitespace: class="..." target="...".
st.markdown("""
<div style="display:flex;gap:2em;justify-content:flex-start;">
<a href="/prt111" class="neon-btn" target="_self">🏠 Home</a>
<a href="/newprod" class="neon-btn neon-btn-pink" target="_self">🚀 New Product Launch</a>
</div>
<br>
""", unsafe_allow_html=True)
def block_markdown(text, color):
    """Wrap *text* in a gradient-styled HTML ``<div>`` tinted with *color*.

    The result is meant for ``st.markdown(..., unsafe_allow_html=True)``.
    Because the text usually comes from an LLM, it is HTML-escaped first so
    characters such as ``<`` or ``&`` are shown literally instead of breaking
    (or injecting) markup. Newlines become ``<br>`` after escaping.

    Args:
        text: Content to display; coerced with ``str()``.
        color: CSS hex colour (e.g. ``"#7CFC00"``). Two-digit alpha suffixes
            are appended to it for the gradient and the shadow.

    Returns:
        The HTML snippet as a string.
    """
    safe_text = html.escape(str(text)).replace("\n", "<br>")
    return (
        f'<div style="background:linear-gradient(90deg,{color}22,#181830 90%);'
        f'padding:16px 22px;border-radius:16px;margin:10px 0 24px 0;'
        f'font-weight:600;color:#fff;font-size:1.04em;line-height:1.6;box-shadow:0 2px 24px {color}19;">'
        f'{safe_text}</div>'
    )
def load_reviews(csv_path):
    """Load the review CSV and make sure the derived columns exist.

    Adds two columns when the file does not already provide them:

    * ``polarity``: 1 / -1 from a DistilBERT SST-2 sentiment pipeline, 0 for
      blank reviews, or 0 for every row if the model cannot be used.
    * ``review_length``: whitespace-separated word count (0 for missing text).

    The ``review_text`` column itself is returned unmodified.

    Args:
        csv_path: Path to a CSV file containing a ``review_text`` column.

    Returns:
        The loaded DataFrame, or an empty DataFrame (after showing a Streamlit
        error) when the file is missing, unreadable, or lacks ``review_text``.
    """
    if not os.path.exists(csv_path):
        st.error(f"CSV file not found: {csv_path}")
        return pd.DataFrame()

    try:
        df = pd.read_csv(csv_path)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, OSError, UnicodeDecodeError) as exc:
        st.error(f"Could not read CSV file {csv_path}: {exc}")
        return pd.DataFrame()

    if "review_text" not in df.columns:
        st.error(f"CSV file has no 'review_text' column: {csv_path}")
        return pd.DataFrame()

    # Work on a cleaned copy of the text so NaN / non-string cells cannot
    # break the sentiment model or the word count.
    texts = df["review_text"].fillna("").astype(str)

    if "polarity" not in df.columns:
        try:
            from transformers import pipeline
            sa = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
            # truncation=True keeps over-long reviews from raising and
            # thereby discarding the scores of every other review.
            df["polarity"] = [
                0 if not text.strip()
                else (1 if sa(text, truncation=True)[0]["label"] == "POSITIVE" else -1)
                for text in texts
            ]
        except Exception as exc:
            # Best effort: the page still works with neutral scores.
            st.warning(
                "Could not compute sentiment scores. All reviews set to neutral (0). "
                f"Reason: {exc}"
            )
            df["polarity"] = 0

    if "review_length" not in df.columns:
        df["review_length"] = texts.str.split().str.len()
    return df
def generate_personas(review_texts, n_personas=4):
    """Ask the Groq chat model to segment reviewers into personas.

    Args:
        review_texts: Review strings; only the first 120 are used and their
            newline-joined text is cut to 3600 characters.
        n_personas: Number of personas requested in the prompt.

    Returns:
        The model's reply with surrounding whitespace stripped, or a string
        starting with ``"Error generating personas: "`` if the request fails.
    """
    # Cap both the number of reviews and the total characters sent.
    review_excerpt = "\n".join(review_texts[:120])[:3600]
    instruction_parts = [
        "Read the following customer reviews for a chocolate-flavored whey protein powder. ",
        f"Based on the language, interests, and context, segment these users into {n_personas} distinct personas. ",
        "For each persona, provide:\n",
        "1. Persona Name starting with emoji\n",
        "2. A one-line summary\n",
        "3. Five detailed bullet points describing their characteristics, needs, goals, or behaviors (each bullet should be specific and insightful, not generic).\n",
        "Give the answer as a numbered list, one for each persona. Format:\n",
        "1. [Emoji] Persona Name\nSummary: ...\n- ...\n- ...\n- ...\n- ...\n- ...\n",
        "\nREVIEWS:\n",
    ]
    prompt = "".join(instruction_parts) + review_excerpt

    try:
        response = groq_client.chat.completions.create(
            model=GROQ_MODEL,
            messages=[
                {"role": "system", "content": PRODUCT_CONTEXT},
                {"role": "user", "content": prompt},
            ],
            max_tokens=900,
            temperature=0.6,
        )
        reply = response.choices[0].message.content
        return reply.strip()
    except Exception as e:
        return f"Error generating personas: {e}"
# A persona header is an optional "1." / "1)" / "1-" number followed by an emoji.
_PERSONA_HEADER_RE = re.compile(r"^([0-9]{1,2}[.)-]?\s*)?[\U0001F300-\U0001FAFF]")
_NUMBER_PREFIX_RE = re.compile(r"^[0-9]{1,2}[.)-]?\s*")
_NUMBERED_BULLET_RE = re.compile(r"^[0-9]{1,2}[.)-]")
# Matches only the list marker itself, never digits that start the bullet text.
_BULLET_MARKER_RE = re.compile(r"^(?:[-•*+]+|[0-9]{1,2}[.)-])\s*")
_SUMMARY_LABEL_RE = re.compile(r"^summary[:\- ]*", flags=re.I)
_BULLET_MARKERS = ("-", "•", "*", "+")


def _strip_markdown(text):
    """Remove bold markers and leading heading hashes from one line."""
    return re.sub(r"^#{1,6}\s*", "", text.strip().replace("**", "")).strip()


def parse_personas_bulletproof(llm_output, n=4):
    """Parse the LLM persona listing into structured dictionaries.

    A persona starts at a line that, once bold/heading markup is removed,
    begins with an optional number followed by an emoji. Within a persona, the
    first line labelled "Summary" (or the first line without a list marker)
    becomes the summary; lines with a list marker become bullets.

    Args:
        llm_output: Raw text returned by the model.
        n: Maximum number of personas to return.

    Returns:
        A list of at most ``n`` dicts with keys ``name`` (str), ``summary``
        (str, possibly empty) and ``bullets`` (list of at most five str).
        Empty when no persona header is found.
    """
    lines = llm_output.splitlines()
    header_rows = [
        i for i, line in enumerate(lines)
        if _PERSONA_HEADER_RE.match(_strip_markdown(line))
    ]
    boundaries = header_rows + [len(lines)]
    spans = list(zip(boundaries, boundaries[1:]))[:n]

    personas = []
    for start, end in spans:
        block = lines[start:end]
        name = _NUMBER_PREFIX_RE.sub("", _strip_markdown(block[0]), count=1).strip()
        summary = ""
        bullets = []
        for raw in block[1:]:
            plain = raw.strip().replace("**", "").strip()
            if not plain:
                continue
            unmarked = _BULLET_MARKER_RE.sub("", plain, count=1).strip()
            is_labelled_summary = unmarked.lower().startswith("summary")
            has_marker = plain.startswith(_BULLET_MARKERS) or bool(_NUMBERED_BULLET_RE.match(plain))
            if not summary and (is_labelled_summary or not plain.startswith(_BULLET_MARKERS)):
                source = unmarked if is_labelled_summary else plain
                summary = _SUMMARY_LABEL_RE.sub("", source, count=1).strip()
            elif has_marker and unmarked:
                bullets.append(unmarked)
        personas.append({
            "name": name,
            "summary": summary,
            "bullets": bullets[:5],
        })
    return personas
def assign_review_to_persona_tfidf(df, persona_defs):
    """Assign each review to its most similar persona using TF-IDF.

    Reviews and persona descriptions (summary plus bullets) are vectorised
    together; each review gets the persona whose vector has the largest dot
    product with it (the first persona wins ties).

    Args:
        df: DataFrame with a ``review_text`` column. Missing values are
            treated as empty text.
        persona_defs: List of dicts with ``name``, ``summary`` and ``bullets``.

    Returns:
        A list of persona names, one per row of ``df`` in row order. Empty
        when ``df`` has no rows or ``persona_defs`` is empty.
    """
    if df.empty or not persona_defs:
        return []

    # Use TF-IDF similarity for assignment (faster than LLM for large data)
    from sklearn.feature_extraction.text import TfidfVectorizer

    # The vectoriser needs plain strings, so coerce NaN / non-string cells.
    review_texts = df["review_text"].fillna("").astype(str).tolist()
    persona_texts = [p["summary"] + " " + " ".join(p["bullets"]) for p in persona_defs]

    tfidf = TfidfVectorizer(stop_words='english')
    X = tfidf.fit_transform(review_texts + persona_texts)
    n_reviews = len(review_texts)
    review_vecs = X[:n_reviews]
    persona_vecs = X[n_reviews:]

    # One sparse product gives the full (reviews x personas) similarity table.
    sims = review_vecs.dot(persona_vecs.T).toarray()
    best = sims.argmax(axis=1)
    return [persona_defs[int(i)]["name"] for i in best]
def groq_bullets_persona(chart_desc, chart_data_text):
    """Ask the Groq chat model for a two-bullet summary of a chart.

    Bullet lines are recognised with the same markers the persona parser
    accepts (``-``, ``•``, ``*``, ``+``) plus numbered items, and are
    normalised to ``"- "`` so the output looks the same whatever list style
    the model picked.

    Args:
        chart_desc: Short description of the chart.
        chart_data_text: The chart data; interpolated into the prompt as-is.

    Returns:
        Up to two ``"- ..."`` lines joined by a newline. If the reply holds
        no bullet lines, the whole reply prefixed with ``"- "``. If the
        request fails, a fixed two-line "not available" message.
    """
    user_prompt = (
        f"Summarize as exactly two bullet points the main insights for this chart: {chart_desc}. "
        f"Here is the data: {chart_data_text}. "
        "Provide a percentage if applicable. Just facts."
    )
    try:
        chat_completion = groq_client.chat.completions.create(
            model=GROQ_MODEL,
            messages=[
                {"role": "system", "content": PRODUCT_CONTEXT},
                {"role": "user", "content": user_prompt}
            ],
            max_tokens=80,
            temperature=0.5,
        )
        reply = chat_completion.choices[0].message.content.strip()
    except Exception:
        # Best effort: a failed summary must not take the page down.
        return "- Summary not available.\n- (LLM error)"

    # A single list marker ("**bold**" is not a marker) or "1." / "1)".
    marker = re.compile(r"^(?:[-•+]|\*(?!\*)|[0-9]{1,2}[.)](?=\s))\s*")
    points = []
    for line in reply.splitlines():
        stripped = line.strip()
        match = marker.match(stripped)
        if not match:
            continue
        body = stripped[match.end():].strip()
        if body:
            points.append("- " + body)

    if points:
        return "\n".join(points[:2])
    return "- " + reply
# --- EMOTION PIPELINE (optional) ---
def emotion_pipeline(df):
    """Add a ``main_emotion`` column holding each review's top emotion label.

    Args:
        df: DataFrame with a ``review_text`` column. It is modified in place.

    Returns:
        The same DataFrame. Every row is labelled ``"neutral"`` when the
        model cannot be loaded, and an individual row is labelled
        ``"neutral"`` when classifying it fails.
    """
    try:
        from transformers import pipeline
        classifier = pipeline(
            "text-classification",
            model="finiteautomata/bertweet-base-emotion-analysis",  # much smaller than roberta-base!
            top_k=None,
            device=-1  # always use CPU, avoid meta-tensor bug
        )
    except Exception as e:
        st.warning(f"Could not load emotion model, skipping emotion analysis: {e}")
        df["main_emotion"] = "neutral"
        return df

    def _dominant_label(text):
        """Return the highest-scoring label for one review, or "neutral"."""
        try:
            scores = classifier(text[:512])
            # Sometimes returns list of lists
            if isinstance(scores, list) and len(scores) and isinstance(scores[0], list):
                scores = scores[0]
            return max(scores, key=lambda item: item["score"])["label"]
        except Exception:
            return "neutral"

    df["main_emotion"] = [_dominant_label(text) for text in df["review_text"]]
    return df
# ========== MAIN PIPELINE ========== #
def _is_usable_review(text):
    """Return True for a non-blank string not flagged "unreadable"/"missing"."""
    return (
        isinstance(text, str)
        and bool(text.strip())
        and "unreadable" not in text
        and "missing" not in text
    )


def _section_heading(title):
    """Render one of the white chart headings."""
    st.markdown(
        f"<h3 style='color:#fff;font-size:2rem;font-weight:700;'>{title}</h3>",
        unsafe_allow_html=True,
    )


with st.spinner("🔎 Analyzing your data... Please wait a few moments."):
    df = load_reviews(CSV_PATH)
    if df.empty or "review_text" not in df.columns:
        df_reviews = pd.DataFrame()
        reviews = []
    else:
        # Keep the usable rows as a frame so persona assignment and all
        # statistics see exactly the reviews the personas were generated from.
        df_reviews = df[df["review_text"].apply(_is_usable_review)].copy()
        reviews = df_reviews["review_text"].tolist()

    if reviews:
        personas_raw = generate_personas(reviews, 4)
        personas = parse_personas_bulletproof(personas_raw, 4)
        if personas:
            try:
                with open(PERSONA_PATH, "w", encoding="utf-8") as f:
                    json.dump(personas, f, ensure_ascii=False, indent=2)
            except OSError as exc:
                # Saving is a convenience; the page can still render.
                st.warning(f"Could not save personas to {PERSONA_PATH}: {exc}")
            else:
                st.success(f"{len(personas)} personas saved for next use.")
            st.session_state['personas'] = personas
    else:
        personas = []

persona_colors = [neon_green, neon_blue, neon_pink, neon_orange]
# One colour per persona, reused by the cards, the bar charts and the highlights.
color_by_persona = {
    p["name"]: persona_colors[i % len(persona_colors)] for i, p in enumerate(personas)
}
persona_names = []

# Persona grid (left-right)
if personas:
    st.markdown("<br>", unsafe_allow_html=True)
    grid_cols = st.columns(2)
    for i, p in enumerate(personas):
        c = persona_colors[i % len(persona_colors)]
        # Persona fields come from the LLM: escape them before embedding in HTML.
        name_html = html.escape(p["name"])
        summary_html = html.escape(p["summary"])
        bullets_html = "".join(f"<li>{html.escape(b)}</li>" for b in p["bullets"])
        with grid_cols[i % 2]:
            st.markdown(
                f"<div style='background:linear-gradient(90deg,{c}18,#181830 95%);"
                "padding:24px 26px 16px 26px;border-radius:18px;margin-bottom:24px;"
                f"box-shadow:0 2px 22px {c}22;'>"
                f"<h2 style='color:{c};margin-bottom:0.18em'>{name_html}</h2>"
                f"<div style='color:#fff;font-size:1.15em;font-weight:500;margin-bottom:10px'>Summary: {summary_html}</div>"
                f"<div style='color:{neon_pink};font-weight:700;font-size:1.08em;margin-bottom:2px'>Characteristics</div>"
                f"<ul style='font-size:1.02em;margin-top:3px'>{bullets_html}</ul>"
                "</div>", unsafe_allow_html=True
            )
        persona_names.append(p["name"])
    st.markdown("<hr>", unsafe_allow_html=True)

if personas and reviews:
    # Assign reviews to persona via TF-IDF (fast)
    df_reviews["persona"] = assign_review_to_persona_tfidf(df_reviews, personas)

    # --- Generate all summary stats for new graphs
    # 1. Persona Review Share
    persona_counts = df_reviews["persona"].value_counts()
    # 2. Persona Sentiment
    avg_sentiment = df_reviews.groupby("persona")["polarity"].mean()
    # 3. Persona Review Length
    avg_length = df_reviews.groupby("persona")["review_length"].mean()
    # 4. Persona Emotion (optional)
    if "main_emotion" not in df_reviews.columns:
        df_reviews = emotion_pipeline(df_reviews)
    emo_dist = df_reviews.groupby("persona")["main_emotion"].value_counts().unstack().fillna(0)

    # --- Row 1: Pie and Sentiment Bar
    c1, c2 = st.columns(2)
    with c1:
        _section_heading("Sales/Review Share by Persona")
        fig = go.Figure(data=[go.Pie(labels=persona_counts.index, values=persona_counts.values, hole=0.45)])
        fig.update_traces(textinfo='percent+label')
        st.plotly_chart(fig, use_container_width=True)
        st.markdown(block_markdown(
            groq_bullets_persona("Sales/Review Share by Persona", persona_counts.to_dict()), neon_green
        ), unsafe_allow_html=True)
    with c2:
        _section_heading("Average Sentiment by Persona")
        # Bars follow the groupby order, so look colours up by persona name.
        sentiment_colors = [color_by_persona.get(name, neon_blue) for name in avg_sentiment.index]
        fig2 = go.Figure(data=[go.Bar(x=avg_sentiment.index, y=avg_sentiment.values, marker=dict(color=sentiment_colors))])
        fig2.update_layout(xaxis_title="Persona", yaxis_title="Avg Sentiment", font=dict(size=15))
        st.plotly_chart(fig2, use_container_width=True)
        st.markdown(block_markdown(
            groq_bullets_persona("Average Sentiment by Persona", avg_sentiment.to_dict()), neon_blue
        ), unsafe_allow_html=True)

    # --- Row 2: Review Length and Emotion Distribution
    c3, c4 = st.columns(2)
    with c3:
        _section_heading("Persona vs. Review Length Distribution")
        length_colors = [color_by_persona.get(name, neon_blue) for name in avg_length.index]
        fig3 = go.Figure(data=[go.Bar(x=avg_length.index, y=avg_length.values, marker=dict(color=length_colors))])
        fig3.update_layout(xaxis_title="Persona", yaxis_title="Avg Review Length", font=dict(size=15))
        st.plotly_chart(fig3, use_container_width=True)
        st.markdown(block_markdown(
            groq_bullets_persona("Average review length (words) by persona", avg_length.to_dict()), neon_orange
        ), unsafe_allow_html=True)
    with c4:
        _section_heading("Persona vs. Emotion Distribution")
        fig4 = go.Figure()
        for em in emo_dist.columns:
            fig4.add_trace(go.Bar(name=em, x=emo_dist.index, y=emo_dist[em].values))
        fig4.update_layout(barmode='stack', xaxis_title="Persona", yaxis_title="Emotion Count", font=dict(size=15))
        st.plotly_chart(fig4, use_container_width=True)
        st.markdown(block_markdown(
            groq_bullets_persona("Distribution of primary emotions per persona", emo_dist.to_dict()), neon_pink
        ), unsafe_allow_html=True)

    # --- Persona-wise Highlights, grouped by persona with headings ---
    st.markdown("<hr><h2 style='color:#fff'>Persona-wise Sentiment Highlights & Recommendations</h2>", unsafe_allow_html=True)
    persona_grid = st.columns(2)
    for idx, p in enumerate(personas):
        persona_df = df_reviews[df_reviews["persona"] == p["name"]]
        top_pos = persona_df[persona_df["polarity"] > 0]["review_text"].head(2).tolist()
        top_neg = persona_df[persona_df["polarity"] < 0]["review_text"].head(2).tolist()
        pos_summary = groq_bullets_persona(
            f"Summarize two main positive sentiment points, with percentage, for persona '{p['name']}'.",
            " ".join(top_pos)
        ) if top_pos else "No positive reviews."
        neg_summary = groq_bullets_persona(
            f"Summarize two main negative sentiment points, with percentage, for persona '{p['name']}'.",
            " ".join(top_neg)
        ) if top_neg else "No negative reviews."

        # The prompt refers to "review highlights and persona details", so
        # those are appended; without them the model only sees the name.
        rec_prompt = (
            "You are a product marketing strategist. "
            f"Based on the review highlights and persona details for '{p['name']}' "
            "(do not repeat the characteristics), write one concise or mention name of user, actionable product or marketing recommendation. Dont put * anywhere "
            "for the company to better engage this persona. "
            "Focus on practical actions the business can take (such as messaging, offers, features, or campaigns). "
            "Reply with 1-2 sentences, avoid restating the persona’s traits."
            f"\n\nPersona summary: {p['summary']}"
            f"\nPositive highlights: {pos_summary}"
            f"\nNegative highlights: {neg_summary}"
        )
        try:
            rec_out = groq_client.chat.completions.create(
                model=GROQ_MODEL,
                messages=[
                    {"role": "system", "content": PRODUCT_CONTEXT},
                    {"role": "user", "content": rec_prompt}
                ],
                max_tokens=80, temperature=0.5
            ).choices[0].message.content.strip()
        except Exception:
            # Best effort, but never swallow KeyboardInterrupt/SystemExit.
            rec_out = "No recommendation available."

        card_color = persona_colors[idx % len(persona_colors)]
        with persona_grid[idx % 2]:
            st.markdown(
                f"<div style='margin-bottom:38px;padding:18px 20px 8px 20px;border-radius:18px;"
                f"background:linear-gradient(90deg,{card_color}22,#181830 100%);box-shadow:0 2px 22px {card_color}18;'>"
                f"<h2 style='color:{card_color};font-size:1.35em;margin-bottom:0.3em'>{html.escape(p['name'])}</h2>"
                f"<div style='color:#fff;font-size:1.13em;font-weight:400;margin-bottom:14px;'>{html.escape(p['summary'])}</div>"
                "<div style='margin-bottom:16px'>"
                f"<b style='color:{neon_green};font-size:1.1em;'>Top Positive Sentiments:</b><br>{block_markdown(pos_summary, neon_green)}"
                "</div>"
                "<div style='margin-bottom:16px'>"
                f"<b style='color:{neon_pink};font-size:1.1em;'>Top Negative Sentiments:</b><br>{block_markdown(neg_summary, neon_pink)}"
                "</div>"
                "<div>"
                f"<b style='color:{neon_yellow};font-size:1.1em;'>Recommendation:</b><br>{block_markdown(rec_out, neon_yellow)}"
                "</div>"
                "</div>", unsafe_allow_html=True
            )

# NOTE(review): the original nesting of this closing rule was lost in
# extraction; it is placed at page level here — confirm.
st.markdown("---")