# InsightLoop / src/pages/persona.py
# Kushagra13's picture
# Update src/pages/persona.py
# d85164d verified
import html
import json
import os
import re
from collections import defaultdict
from itertools import cycle

import numpy as np
import pandas as pd
import plotly.graph_objs as go
import streamlit as st
from dotenv import load_dotenv
from groq import Groq
# Load the .env file BEFORE any environment lookup below, so values defined
# there (PERSONA_PATH, HF cache locations, GROQ_API_KEY) are actually seen.
# With its default settings load_dotenv() does not overwrite variables that
# are already set, so it has to run before the cache defaults are written.
load_dotenv()

# Path of the JSON file the generated personas are written to.
PERSONA_PATH = os.getenv("PERSONA_PATH", "/tmp/personas.json")

# Set HuggingFace cache directories to /tmp for cloud hosting (permission safe)
os.environ.setdefault("TRANSFORMERS_CACHE", "/tmp/hf_cache")
os.environ.setdefault("HF_HOME", "/tmp/huggingface")
os.environ.setdefault("HF_DATASETS_CACHE", "/tmp/huggingface")

# --- THEME COLORS ---
neon_blue = "#00fff7"
neon_green = "#7CFC00"
neon_pink = "#F72585"
neon_yellow = "#FFF600"
neon_bg = "#181830"
neon_orange = "#FFB347"
neon_dark = "#202037"

GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

# --- CONFIG ---
GROQ_MODEL = "llama3-70b-8192"
groq_client = Groq(api_key=GROQ_API_KEY)

# System prompt shared by every Groq chat completion in this page.
PRODUCT_CONTEXT = (
    "You are an AI market research expert analyzing customer reviews for a chocolate-flavoured whey protein powder. "
    "Generate user personas based on patterns and diversity in the reviews."
)

# CSV file holding the reviews (read by load_reviews).
CSV_PATH = "src/data_with_text.csv"
# Page configuration is issued first, ahead of every other Streamlit call in
# this script.
st.set_page_config(page_title="Persona Lab", layout="wide", initial_sidebar_state="collapsed")

# Page title.
st.markdown(
    "<h1 style='color:#00fff7;font-size:2.6rem;font-weight:900;letter-spacing:0.01em;margin-bottom:5px;'>🎭 Persona Lab</h1>",
    unsafe_allow_html=True
)

# Set dark theme programmatically
st.markdown(
    """
<style>
body, .main, .stApp {
background: #14151A !important;
color: #fff !important;
}
</style>
""",
    unsafe_allow_html=True
)

# Intro blurb under the title (plain string: it contains no placeholders).
st.markdown(
    """
<div style="font-size:1.21rem; color:#AC7CFF; font-weight:600; margin-top:-13px; margin-bottom:14px; line-height:1.5;">
Ready to peek inside the minds of your customers?
This is your sandbox for uncovering who buys, why they rave, and what they crave—powered by real reviews and sharp AI.
Dive in, explore the personas that drive your market, and see your brand through their eyes (and taste buds)!
</div>
""",
    unsafe_allow_html=True
)

# --- NAVIGATION BUTTONS ---
# CSS for the two link-buttons rendered below.
st.markdown("""
<style>
.neon-btn {
display:inline-block;
font-weight:bold;
padding:14px 32px;
border:none;
border-radius:12px;
font-size:1.1em;
margin-right:18px;
cursor:pointer;
box-shadow:0 0 14px #00fff777;
color:#222 !important;
background:linear-gradient(90deg,#7CFC00,#00fff7);
text-decoration:none !important;
transition: transform 0.08s;
}
.neon-btn-pink {
background:linear-gradient(90deg,#F72585,#00fff7);
color:#fff !important;
box-shadow:0 0 14px #F7258577;
}
.neon-btn:hover {
transform:scale(1.04);
box-shadow:0 0 24px #00fff799;
}
.neon-btn-pink:hover {
box-shadow:0 0 24px #F7258599;
}
</style>
""", unsafe_allow_html=True)

# Navigation links. Attributes are separated by whitespace; the original markup
# ran `class="..."` straight into `target=`, which is not valid HTML.
st.markdown("""
<div style="display:flex;gap:2em;justify-content:flex-start;">
<a href="/prt111" class="neon-btn" target="_self">🏠 Home</a>
<a href="/newprod" class="neon-btn neon-btn-pink" target="_self">🚀 New Product Launch</a>
</div>
<br>
""", unsafe_allow_html=True)
def block_markdown(text, color):
    """Wrap *text* in a gradient-styled HTML ``<div>`` for ``st.markdown``.

    The text is HTML-escaped (``&``, ``<``, ``>``) before newlines are turned
    into ``<br>`` tags, so model- or user-supplied content cannot inject markup
    when the result is rendered with ``unsafe_allow_html=True``.

    Args:
        text: Plain text to display. ``None`` is rendered as an empty block;
            any other value is converted with ``str()``.
        color: Colour string interpolated into the CSS with the suffixes
            ``22`` and ``19`` appended (presumably a ``#RRGGBB`` value so the
            suffix acts as an alpha channel -- confirm against callers).

    Returns:
        The HTML snippet as a string.
    """
    raw = "" if text is None else str(text)
    # Escape first, then insert the only markup we intend to emit (<br>).
    safe_text = html.escape(raw, quote=False).replace('\n', '<br>')
    return (
        f'<div style="background:linear-gradient(90deg,{color}22,#181830 90%);'
        f'padding:16px 22px;border-radius:16px;margin:10px 0 24px 0;'
        f'font-weight:600;color:#fff;font-size:1.04em;line-height:1.6;box-shadow:0 2px 24px {color}19;">'
        f'{safe_text}</div>'
    )
@st.cache_data(show_spinner=True)
def load_reviews(csv_path):
    """Load the review CSV and make sure the derived columns exist.

    Adds a ``polarity`` column (1 / -1 from a sentiment model, 0 for rows with
    no usable text) and a ``review_length`` column (whitespace-separated word
    count) when the CSV does not already provide them.

    Args:
        csv_path: Path of the CSV file to read; it must contain a
            ``review_text`` column.

    Returns:
        The loaded ``DataFrame``, or an empty ``DataFrame`` when the file or
        the ``review_text`` column is missing (an error is shown in the UI).
    """
    if not os.path.exists(csv_path):
        st.error(f"CSV file not found: {csv_path}")
        return pd.DataFrame()
    df = pd.read_csv(csv_path)
    if "review_text" not in df.columns:
        # Everything downstream reads this column; fail with a clear message
        # instead of a KeyError.
        st.error(f"CSV file has no 'review_text' column: {csv_path}")
        return pd.DataFrame()
    if "polarity" not in df.columns:
        try:
            from transformers import pipeline
            sa = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

            def _polarity(text):
                # Missing / blank text gets a neutral score instead of raising
                # and discarding the scores of every other row.
                if not isinstance(text, str) or not text.strip():
                    return 0
                # truncation=True keeps over-long reviews within the model's
                # maximum input length.
                label = sa(text, truncation=True)[0]["label"]
                return 1 if label == "POSITIVE" else -1

            df["polarity"] = df["review_text"].apply(_polarity)
        except Exception:
            # Best effort: the model may be unavailable in the host environment.
            st.warning("Could not compute sentiment scores. All reviews set to neutral (0).")
            df["polarity"] = 0
    if "review_length" not in df.columns:
        df["review_length"] = df["review_text"].apply(lambda x: len(str(x).split()))
    return df
def generate_personas(review_texts, n_personas=4):
    """Ask the Groq chat model to segment the reviews into personas.

    Args:
        review_texts: List of review strings. Only the first 120 are used, and
            their newline-joined text is cut to 3600 characters.
        n_personas: Number of personas requested in the prompt.

    Returns:
        The model's reply with surrounding whitespace stripped, or a string
        starting with ``"Error generating personas: "`` if the request fails.
    """
    # Bound the amount of review text sent to the model.
    review_excerpt = "\n".join(review_texts[:120])[:3600]
    prompt_parts = [
        "Read the following customer reviews for a chocolate-flavored whey protein powder. ",
        f"Based on the language, interests, and context, segment these users into {n_personas} distinct personas. ",
        "For each persona, provide:\n",
        "1. Persona Name starting with emoji\n",
        "2. A one-line summary\n",
        "3. Five detailed bullet points describing their characteristics, needs, goals, or behaviors (each bullet should be specific and insightful, not generic).\n",
        "Give the answer as a numbered list, one for each persona. Format:\n",
        "1. [Emoji] Persona Name\nSummary: ...\n- ...\n- ...\n- ...\n- ...\n- ...\n",
        "\nREVIEWS:\n",
        review_excerpt,
    ]
    prompt = "".join(prompt_parts)
    try:
        response = groq_client.chat.completions.create(
            model=GROQ_MODEL,
            messages=[
                {"role": "system", "content": PRODUCT_CONTEXT},
                {"role": "user", "content": prompt},
            ],
            max_tokens=900,
            temperature=0.6,
        )
        reply = response.choices[0].message.content
        return reply.strip()
    except Exception as exc:
        # Any failure is reported to the caller as text rather than raised.
        return f"Error generating personas: {exc}"
def parse_personas_bulletproof(llm_output, n=4):
    """Parse the model's persona list into structured dictionaries.

    A persona starts on a line that begins (after an optional ``1.``-style
    number and optional markdown ``**`` / ``#`` decoration) with an emoji in
    the range U+1F300-U+1FAFF. Within a persona, the first line that mentions
    "summary" or is not a bullet becomes the summary; bullet or numbered lines
    become bullets.

    Args:
        llm_output: Raw text returned by the model; ``None`` / empty is allowed.
        n: Maximum number of personas to return.

    Returns:
        A list of dicts with keys ``"name"``, ``"summary"`` and ``"bullets"``
        (at most five bullets each). Empty when no persona header is found.
    """
    if not llm_output:
        return []

    bullet_marks = ("-", "•", "*", "+")
    header_re = re.compile(r"^([0-9]{1,2}[.)-]?\s*)?[\U0001F300-\U0001FAFF]")
    number_prefix_re = re.compile(r"^([0-9]{1,2}[.)-]?\s*)?")
    numbered_re = re.compile(r"^[0-9]{1,2}[.)-]")
    # Strips only the list marker: either a run of bullet characters/spaces or
    # ONE "12." style number. Digits that belong to the text ("30g protein")
    # are kept.
    marker_re = re.compile(r"^(?:[-•*+\s]+|[0-9]{1,2}[.)-]\s*)")
    summary_label_re = re.compile(r"^summary[:\- ]*", flags=re.I)

    def _plain(line):
        # Drop markdown bold markers; they carry no meaning once rendered as HTML.
        return line.replace("**", "").strip()

    def _header_text(line):
        # Also tolerate markdown heading hashes in front of a persona title.
        return _plain(line).lstrip("#").strip()

    lines = llm_output.splitlines()
    starts = [i for i, line in enumerate(lines) if header_re.match(_header_text(line))]

    personas = []
    for pos, start in enumerate(starts[:n]):
        end = starts[pos + 1] if pos + 1 < len(starts) else len(lines)
        block = lines[start:end]
        name = number_prefix_re.sub("", _header_text(block[0]), count=1).strip()
        summary = ""
        bullets = []
        for raw in block[1:]:
            line = raw.strip()
            if not line:
                continue
            is_bullet = line.startswith(bullet_marks)
            plain = _plain(line)
            if not summary and ("summary" in line.lower() or not is_bullet):
                if is_bullet:
                    # e.g. "- Summary: ..." or "**Summary:** ..."
                    plain = plain.lstrip("-•*+ ")
                summary = summary_label_re.sub("", plain, count=1)
            elif is_bullet or numbered_re.match(line):
                text = marker_re.sub("", plain, count=1)
                if text:
                    bullets.append(text)
        personas.append({
            "name": name,
            "summary": summary,
            "bullets": bullets[:5],
        })
    return personas
def assign_review_to_persona_tfidf(df, persona_defs):
    """Assign each review to the persona whose description it most resembles.

    Reviews and persona descriptions (summary plus bullets) are vectorised
    together with TF-IDF; each review gets the persona with the highest
    dot-product similarity. Ties, including an all-zero similarity row, go to
    the earliest persona in ``persona_defs``.

    Args:
        df: DataFrame with a ``review_text`` column. Missing values are
            treated as empty text.
        persona_defs: List of dicts with ``"name"``, ``"summary"`` and
            ``"bullets"`` keys.

    Returns:
        A list with one persona name per row of ``df`` (empty when there are
        no personas or no rows).
    """
    if not persona_defs or len(df) == 0:
        return []
    # Use TF-IDF cosine similarity for assignment (faster than LLM for large data)
    from sklearn.feature_extraction.text import TfidfVectorizer
    persona_texts = [p["summary"] + " " + " ".join(p["bullets"]) for p in persona_defs]
    # NaN / non-string cells would make the vectoriser raise.
    review_texts = df["review_text"].fillna("").astype(str).tolist()
    tfidf = TfidfVectorizer(stop_words='english')
    X = tfidf.fit_transform(review_texts + persona_texts)
    n_reviews = len(review_texts)
    review_vecs = X[:n_reviews]
    persona_vecs = X[n_reviews:]
    # One (n_reviews x n_personas) product instead of a per-row Python loop.
    sims = review_vecs.dot(persona_vecs.T).toarray()
    best = sims.argmax(axis=1)
    return [persona_defs[int(i)]["name"] for i in best]
def groq_bullets_persona(chart_desc, chart_data_text):
    """Ask the Groq chat model for a two-bullet summary of a chart.

    Args:
        chart_desc: Description of the chart, inserted into the prompt.
        chart_data_text: The chart's data, inserted into the prompt via
            f-string formatting.

    Returns:
        The first two reply lines that start with ``-`` or ``•`` joined by a
        newline; if fewer than two such lines exist, ``"- "`` followed by the
        whole reply; on any error, a fixed two-line fallback message.
    """
    request = "".join((
        f"Summarize as exactly two bullet points the main insights for this chart: {chart_desc}. ",
        f"Here is the data: {chart_data_text}. ",
        "Provide a percentage if applicable. Just facts.",
    ))
    try:
        response = groq_client.chat.completions.create(
            model=GROQ_MODEL,
            messages=[
                {"role": "system", "content": PRODUCT_CONTEXT},
                {"role": "user", "content": request},
            ],
            max_tokens=80,
            temperature=0.5,
        )
        reply = response.choices[0].message.content.strip()
        bullet_lines = []
        for candidate in reply.splitlines():
            if candidate.strip().startswith(("-", "•")):
                bullet_lines.append(candidate)
        if len(bullet_lines) < 2:
            # Not a recognisable bullet list: present the whole reply as one bullet.
            return "- " + reply
        return "\n".join(bullet_lines[:2])
    except Exception:
        return "- Summary not available.\n- (LLM error)"
# --- EMOTION PIPELINE (optional) ---
def emotion_pipeline(df):
    """Add a ``main_emotion`` column holding each review's top-scoring emotion.

    Args:
        df: DataFrame with a ``review_text`` column. It is modified in place.

    Returns:
        The same DataFrame with ``main_emotion`` set. Every row is
        ``"neutral"`` if the model cannot be loaded (a warning is shown), and
        an individual row is ``"neutral"`` if classifying it fails.
    """
    try:
        from transformers import pipeline
        classifier = pipeline(
            "text-classification",
            model="finiteautomata/bertweet-base-emotion-analysis",  # much smaller than roberta-base!
            top_k=None,
            device=-1,  # always use CPU, avoid meta-tensor bug
        )
    except Exception as exc:
        st.warning(f"Could not load emotion model, skipping emotion analysis: {exc}")
        df["main_emotion"] = "neutral"
        return df

    def _dominant_label(text):
        # Any failure for a single review (including non-string text) falls
        # back to "neutral" for that row only.
        try:
            scores = classifier(text[:512])
            nested = isinstance(scores, list) and len(scores) and isinstance(scores[0], list)
            if nested:
                # Sometimes returns list of lists
                scores = scores[0]
            return max(scores, key=lambda item: item["score"])["label"]
        except Exception:
            return "neutral"

    df["main_emotion"] = [_dominant_label(text) for text in df["review_text"]]
    return df
# ========== MAIN PIPELINE ========== #
# Load the reviews, ask the model for personas and persist them.
personas = []
with st.spinner("🔎 Analyzing your data... Please wait a few moments."):
    df = load_reviews(CSV_PATH)
    reviews = df["review_text"].dropna().tolist() if not df.empty else []
    # Keep only real text: skip non-strings, blank entries and rows containing
    # the words "unreadable" or "missing".
    reviews = [
        t for t in reviews
        if isinstance(t, str) and "unreadable" not in t and "missing" not in t and t.strip()
    ]
    if reviews:
        personas_raw = generate_personas(reviews, 4)
        personas = parse_personas_bulletproof(personas_raw, 4)
        if personas:
            st.session_state['personas'] = personas
            try:
                with open(PERSONA_PATH, "w", encoding="utf-8") as f:
                    json.dump(personas, f, ensure_ascii=False, indent=2)
            except OSError as exc:
                # Persisting is a convenience; the page can still render.
                st.warning(f"Personas generated but could not be saved to {PERSONA_PATH}: {exc}")
            else:
                st.success(f"{len(personas)} personas saved for next use.")
        else:
            # generate_personas reports failures as text, which yields no
            # parsable persona; surface that instead of rendering nothing.
            st.warning(f"No personas could be generated from the reviews. Model response: {personas_raw}")

persona_colors = [neon_green, neon_blue, neon_pink, neon_orange]
persona_cycler = cycle(persona_colors)
persona_blocks = []
persona_names = []
# Persona grid (left-right)
# Renders one card per persona, alternating between two columns.
# NOTE(review): the original indentation was lost; the closing <hr> is assumed
# to belong to this persona section -- confirm.
if personas:
    st.markdown("<br>", unsafe_allow_html=True)
    grid_cols = st.columns(2)
    for i, p in enumerate(personas):
        c = next(persona_cycler)
        # Persona fields come from the model; escape them before embedding in
        # HTML that is rendered with unsafe_allow_html=True.
        name_html = html.escape(str(p["name"]), quote=False)
        summary_html = html.escape(str(p["summary"]), quote=False)
        bullets_html = "".join(
            f"<li>{html.escape(str(b), quote=False)}</li>" for b in p["bullets"]
        )
        with grid_cols[i % 2]:
            st.markdown(
                f"<div style='background:linear-gradient(90deg,{c}18,#181830 95%);"
                "padding:24px 26px 16px 26px;border-radius:18px;margin-bottom:24px;"
                f"box-shadow:0 2px 22px {c}22;'>"
                f"<h2 style='color:{c};margin-bottom:0.18em'>{name_html}</h2>"
                f"<div style='color:#fff;font-size:1.15em;font-weight:500;margin-bottom:10px'>Summary: {summary_html}</div>"
                f"<div style='color:{neon_pink};font-weight:700;font-size:1.08em;margin-bottom:2px'>Characteristics</div>"
                f"<ul style='font-size:1.02em;margin-top:3px'>{bullets_html}</ul>"
                "</div>", unsafe_allow_html=True
            )
        persona_names.append(p["name"])
    st.markdown("<hr>", unsafe_allow_html=True)
# Charts and per-persona highlights; only rendered when personas were parsed
# and at least one usable review exists.
if personas and len(reviews) > 0:
    # Assign reviews to persona via TF-IDF (fast)
    persona_for_review = assign_review_to_persona_tfidf(df, personas)
    df_reviews = df.iloc[:len(persona_for_review)].copy()
    df_reviews["persona"] = persona_for_review

    # --- Generate all summary stats for new graphs
    # 1. Persona Review Share
    persona_counts = df_reviews["persona"].value_counts()
    # 2. Persona Sentiment
    avg_sentiment = df_reviews.groupby("persona")["polarity"].mean()
    # 3. Persona Review Length
    avg_length = df_reviews.groupby("persona")["review_length"].mean()
    # 4. Persona Emotion (optional)
    if "main_emotion" not in df_reviews.columns:
        df_reviews = emotion_pipeline(df_reviews)
    emo_dist = df_reviews.groupby("persona")["main_emotion"].value_counts().unstack().fillna(0)

    bar_colors = [neon_green, neon_blue, neon_pink, neon_orange]

    # --- Row 1: Pie and Sentiment Bar
    c1, c2 = st.columns(2)
    with c1:
        st.markdown("<h3 style='color:#fff;font-size:2rem;font-weight:700;'>Sales/Review Share by Persona</h3>", unsafe_allow_html=True)
        fig = go.Figure(data=[go.Pie(labels=persona_counts.index, values=persona_counts.values, hole=0.45)])
        fig.update_traces(textinfo='percent+label')
        st.plotly_chart(fig, use_container_width=True)
        st.markdown(block_markdown(
            groq_bullets_persona("Sales/Review Share by Persona", persona_counts.to_dict()), neon_green
        ), unsafe_allow_html=True)
    with c2:
        st.markdown("<h3 style='color:#fff;font-size:2rem;font-weight:700;'>Average Sentiment by Persona</h3>", unsafe_allow_html=True)
        fig2 = go.Figure(data=[go.Bar(x=avg_sentiment.index, y=avg_sentiment.values, marker=dict(color=bar_colors))])
        fig2.update_layout(xaxis_title="Persona", yaxis_title="Avg Sentiment", font=dict(size=15))
        st.plotly_chart(fig2, use_container_width=True)
        st.markdown(block_markdown(
            groq_bullets_persona("Average Sentiment by Persona", avg_sentiment.to_dict()), neon_blue
        ), unsafe_allow_html=True)

    # --- Row 2: Review Length and Emotion Distribution
    c3, c4 = st.columns(2)
    with c3:
        st.markdown("<h3 style='color:#fff;font-size:2rem;font-weight:700;'>Persona vs. Review Length Distribution</h3>", unsafe_allow_html=True)
        fig3 = go.Figure(data=[go.Bar(x=avg_length.index, y=avg_length.values, marker=dict(color=bar_colors))])
        fig3.update_layout(xaxis_title="Persona", yaxis_title="Avg Review Length", font=dict(size=15))
        st.plotly_chart(fig3, use_container_width=True)
        st.markdown(block_markdown(
            groq_bullets_persona("Average review length (words) by persona", avg_length.to_dict()), neon_orange
        ), unsafe_allow_html=True)
    with c4:
        st.markdown("<h3 style='color:#fff;font-size:2rem;font-weight:700;'>Persona vs. Emotion Distribution</h3>", unsafe_allow_html=True)
        fig4 = go.Figure()
        for em in emo_dist.columns:
            fig4.add_trace(go.Bar(name=em, x=emo_dist.index, y=emo_dist[em].values))
        fig4.update_layout(barmode='stack', xaxis_title="Persona", yaxis_title="Emotion Count", font=dict(size=15))
        st.plotly_chart(fig4, use_container_width=True)
        st.markdown(block_markdown(
            groq_bullets_persona("Distribution of primary emotions per persona", emo_dist.to_dict()), neon_pink
        ), unsafe_allow_html=True)

    # --- Persona-wise Highlights, grouped by persona with headings ---
    st.markdown("<hr><h2 style='color:#fff'>Persona-wise Sentiment Highlights & Recommendations</h2>", unsafe_allow_html=True)
    persona_grid = st.columns(2)
    for idx, p in enumerate(personas):
        persona_df = df_reviews[df_reviews["persona"] == p["name"]]
        # First two positive / negative reviews assigned to this persona.
        top_pos = persona_df.loc[persona_df["polarity"] > 0, "review_text"].head(2).tolist()
        top_neg = persona_df.loc[persona_df["polarity"] < 0, "review_text"].head(2).tolist()
        pos_summary = groq_bullets_persona(
            f"Summarize two main positive sentiment points, with percentage, for persona '{p['name']}'.",
            " ".join(map(str, top_pos))
        ) if top_pos else "No positive reviews."
        neg_summary = groq_bullets_persona(
            f"Summarize two main negative sentiment points, with percentage, for persona '{p['name']}'.",
            " ".join(map(str, top_neg))
        ) if top_neg else "No negative reviews."

        # The prompt refers to the persona details and review highlights, so
        # they are included; without them the model only sees the name.
        rec_prompt = (
            "You are a product marketing strategist. "
            f"Based on the review highlights and persona details for '{p['name']}' given below "
            "(do not repeat the characteristics), write one concise, actionable product or marketing "
            "recommendation for the company to better engage this persona; you may mention the persona by name. "
            "Focus on practical actions the business can take (such as messaging, offers, features, or campaigns). "
            "Reply with 1-2 sentences, avoid restating the persona’s traits, and do not use the * character anywhere.\n\n"
            f"Persona summary: {p['summary']}\n"
            f"Positive highlights:\n{pos_summary}\n"
            f"Negative highlights:\n{neg_summary}"
        )
        try:
            rec_out = groq_client.chat.completions.create(
                model=GROQ_MODEL,
                messages=[
                    {"role": "system", "content": PRODUCT_CONTEXT},
                    {"role": "user", "content": rec_prompt}
                ],
                max_tokens=80, temperature=0.5
            ).choices[0].message.content.strip()
        except Exception:
            # Best effort, but do not swallow KeyboardInterrupt / SystemExit
            # the way a bare `except:` would.
            rec_out = "No recommendation available."

        card_color = persona_colors[idx % len(persona_colors)]
        # Persona fields come from the model; escape them before embedding in
        # HTML rendered with unsafe_allow_html=True.
        name_html = html.escape(str(p["name"]), quote=False)
        summary_html = html.escape(str(p["summary"]), quote=False)
        with persona_grid[idx % 2]:
            st.markdown(
                f"<div style='margin-bottom:38px;padding:18px 20px 8px 20px;border-radius:18px;"
                f"background:linear-gradient(90deg,{card_color}22,#181830 100%);box-shadow:0 2px 22px {card_color}18;'>"
                f"<h2 style='color:{card_color};font-size:1.35em;margin-bottom:0.3em'>{name_html}</h2>"
                f"<div style='color:#fff;font-size:1.13em;font-weight:400;margin-bottom:14px;'>{summary_html}</div>"
                "<div style='margin-bottom:16px'>"
                f"<b style='color:{neon_green};font-size:1.1em;'>Top Positive Sentiments:</b><br>{block_markdown(pos_summary, neon_green)}"
                "</div>"
                "<div style='margin-bottom:16px'>"
                f"<b style='color:{neon_pink};font-size:1.1em;'>Top Negative Sentiments:</b><br>{block_markdown(neg_summary, neon_pink)}"
                "</div>"
                "<div>"
                f"<b style='color:{neon_yellow};font-size:1.1em;'>Recommendation:</b><br>{block_markdown(rec_out, neon_yellow)}"
                "</div>"
                "</div>", unsafe_allow_html=True
            )
    # NOTE(review): the original indentation was lost; this closing divider is
    # assumed to come once after all persona cards -- confirm.
    st.markdown("---")