"""Persona Lab: a Streamlit page that derives customer personas from reviews.

The script loads a CSV of product reviews, asks a Groq-hosted LLM to propose
personas, assigns every review to its closest persona with TF-IDF similarity,
and renders per-persona charts, sentiment highlights and recommendations.

NOTE(review): the inline HTML/CSS that the ``st.markdown`` calls originally
carried was lost before this file reached review (only the visible text
survived). The markup below is a minimal reconstruction around that text;
restore the original styling from version control if it is available.
"""

import html
import json
import os
import re
from collections import defaultdict
from itertools import cycle

import numpy as np
import pandas as pd
import plotly.graph_objs as go
import streamlit as st
from dotenv import load_dotenv
from groq import Groq

# Load .env before the first os.getenv() call so that values defined there
# (PERSONA_PATH, cache directories, GROQ_API_KEY) are actually honoured.
load_dotenv()

PERSONA_PATH = os.getenv("PERSONA_PATH", "/tmp/personas.json")

# Set HuggingFace cache directories to /tmp for cloud hosting (permission safe)
os.environ["TRANSFORMERS_CACHE"] = os.getenv("TRANSFORMERS_CACHE", "/tmp/hf_cache")
os.environ["HF_HOME"] = os.getenv("HF_HOME", "/tmp/huggingface")
os.environ["HF_DATASETS_CACHE"] = os.getenv("HF_DATASETS_CACHE", "/tmp/huggingface")

# --- THEME COLORS ---
neon_blue = "#00fff7"
neon_green = "#7CFC00"
neon_pink = "#F72585"
neon_yellow = "#FFF600"
neon_bg = "#181830"
neon_orange = "#FFB347"
neon_dark = "#202037"

GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

# --- CONFIG ---
GROQ_MODEL = "llama3-70b-8192"
PRODUCT_CONTEXT = (
    "You are an AI market research expert analyzing customer reviews for a chocolate-flavoured whey protein powder. "
    "Generate user personas based on patterns and diversity in the reviews."
)
CSV_PATH = "src/data_with_text.csv"

# NOTE(review): the navigation link targets were lost together with the
# original markup; these are placeholders -- confirm the real URLs.
HOME_URL = "/"
NEW_PRODUCT_LAUNCH_URL = "#"

# Patterns used by parse_personas_bulletproof(), compiled once.
_BULLET_MARKERS = ("-", "•", "*", "+")
_PERSONA_HEADER_RE = re.compile(
    r"^(\*\*\s*)?([0-9]{1,2}[.)-]?\s*)?(\*\*\s*)?[\U0001F300-\U0001FAFF]"
)
_NUMBER_PREFIX_RE = re.compile(r"^([0-9]{1,2}[.)-]?\s*)?")
_NUMBERED_ITEM_RE = re.compile(r"^[0-9]{1,2}[.)-]")
_BULLET_PREFIX_RE = re.compile(r"^[-•*+0-9.)\s]+")
_SUMMARY_LABEL_RE = re.compile(r"^summary[:\- ]*", re.I)

# Page config is issued before any other Streamlit command (including the
# error shown below when the API key is missing).
st.set_page_config(page_title="Persona Lab", layout="wide", initial_sidebar_state="collapsed")

if not GROQ_API_KEY:
    # Constructing the client without a key raises; fail with a readable message.
    st.error("GROQ_API_KEY is not set. Add it to the environment or to a .env file.")
    st.stop()

groq_client = Groq(api_key=GROQ_API_KEY)

st.markdown(
    "<h1 style='text-align:center;'>🎭 Persona Lab</h1>",
    unsafe_allow_html=True,
)

# Set dark theme programmatically
st.markdown(
    f"<style>.stApp {{ background-color: {neon_bg}; color: #ffffff; }}</style>",
    unsafe_allow_html=True,
)

st.markdown(
    "<p style='text-align:center;'>"
    "Ready to peek inside the minds of your customers? This is your sandbox for uncovering who buys, "
    "why they rave, and what they crave—powered by real reviews and sharp AI. Dive in, explore the "
    "personas that drive your market, and see your brand through their eyes (and taste buds)!"
    "</p>",
    unsafe_allow_html=True,
)

# --- NAVIGATION BUTTONS ---
st.markdown(
    "<style>"
    ".nav-bar { display:flex; gap:1em; justify-content:center; margin:0.5em 0 1.5em; }"
    ".nav-bar a { text-decoration:none; font-weight:600; }"
    "</style>",
    unsafe_allow_html=True,
)
st.markdown(
    "<div class='nav-bar'>"
    f"<a href='{HOME_URL}' target='_self'>🏠 Home</a> "
    f"<a href='{NEW_PRODUCT_LAUNCH_URL}' target='_self'>🚀 New Product Launch</a>"
    "</div>",
    unsafe_allow_html=True,
)


def block_markdown(text, color):
    """Return ``text`` as an HTML block accented with ``color``.

    The text is HTML-escaped (it comes from the LLM and from customer reviews
    and is rendered with ``unsafe_allow_html=True``) and its newlines are
    turned into ``<br>`` so bullet lists keep one item per line.
    """
    body = html.escape(str(text)).replace("\n", "<br>")
    return (
        f"<div style='border-left:4px solid {color}; padding:0.5em 0.9em; margin:0.4em 0;'>"
        f"{body}</div>"
    )


def _polarity_label(classifier, text):
    """Return 1/-1 for positive/negative text, 0 when there is no usable text."""
    if not isinstance(text, str) or not text.strip():
        return 0
    # Same character cap as emotion_pipeline(), so one very long review
    # cannot make the classifier fail for the whole dataset.
    return 1 if classifier(text[:512])[0]["label"] == "POSITIVE" else -1


@st.cache_data(show_spinner=True)
def load_reviews(csv_path):
    """Load the review CSV and make sure the derived columns exist.

    Args:
        csv_path: Path of a CSV file with at least a ``review_text`` column.

    Returns:
        The DataFrame with ``polarity`` (1, -1 or 0) and ``review_length``
        (word count) columns added when they were missing. An empty DataFrame
        is returned, after showing an error, when the file or the
        ``review_text`` column is missing.
    """
    if not os.path.exists(csv_path):
        st.error(f"CSV file not found: {csv_path}")
        return pd.DataFrame()

    df = pd.read_csv(csv_path)
    if "review_text" not in df.columns:
        st.error(f"CSV file has no 'review_text' column: {csv_path}")
        return pd.DataFrame()

    if "polarity" not in df.columns:
        # Best effort: the model is an optional heavy dependency, so any
        # failure (import, download, inference) degrades to neutral scores.
        try:
            from transformers import pipeline

            sa = pipeline(
                "sentiment-analysis",
                model="distilbert-base-uncased-finetuned-sst-2-english",
            )
            df["polarity"] = df["review_text"].apply(lambda x: _polarity_label(sa, x))
        except Exception:
            st.warning("Could not compute sentiment scores. All reviews set to neutral (0).")
            df["polarity"] = 0

    if "review_length" not in df.columns:
        df["review_length"] = df["review_text"].apply(lambda x: len(str(x).split()))
    return df


def generate_personas(review_texts, n_personas=4):
    """Ask the LLM to segment the reviewers into ``n_personas`` personas.

    Only the first 120 reviews, joined and cut to 3600 characters, are sent.

    Returns:
        The raw model answer, or a string starting with
        ``"Error generating personas:"`` when the request fails.
    """
    prompt = (
        "Read the following customer reviews for a chocolate-flavored whey protein powder. "
        f"Based on the language, interests, and context, segment these users into {n_personas} distinct personas. "
        "For each persona, provide:\n"
        "1. Persona Name starting with emoji\n"
        "2. A one-line summary\n"
        "3. Five detailed bullet points describing their characteristics, needs, goals, or behaviors "
        "(each bullet should be specific and insightful, not generic).\n"
        "Give the answer as a numbered list, one for each persona. Format:\n"
        "1. [Emoji] Persona Name\nSummary: ...\n- ...\n- ...\n- ...\n- ...\n- ...\n"
        "\nREVIEWS:\n" + "\n".join(review_texts[:120])[:3600]
    )
    try:
        chat_completion = groq_client.chat.completions.create(
            model=GROQ_MODEL,
            messages=[
                {"role": "system", "content": PRODUCT_CONTEXT},
                {"role": "user", "content": prompt},
            ],
            max_tokens=900,
            temperature=0.6,
        )
        return chat_completion.choices[0].message.content.strip()
    except Exception as e:
        return f"Error generating personas: {e}"


def parse_personas_bulletproof(llm_output, n=4):
    """Parse the LLM answer into at most ``n`` persona dictionaries.

    A persona starts at a line that begins (after an optional ``1.``-style
    number and optional ``**`` bold marker) with an emoji. Within a persona,
    the first "summary" or non-bullet line is the summary and bullet or
    numbered lines are the characteristics.

    Returns:
        A list of ``{"name": str, "summary": str, "bullets": list[str]}``
        with at most five bullets each; empty when no header is found.
    """
    lines = llm_output.splitlines()
    header_rows = [
        i for i, line in enumerate(lines) if _PERSONA_HEADER_RE.match(line.strip())
    ]

    personas = []
    # Pair each header with the next one (or the end of the text).
    for start, end in zip(header_rows[:n], header_rows[1:] + [len(lines)]):
        block = lines[start:end]
        # Drop bold markers and surrounding whitespace first so the number
        # prefix is really at the start of the string when it is removed.
        name_line = _NUMBER_PREFIX_RE.sub("", block[0].replace("**", "").strip()).strip()

        summary = ""
        bullets = []
        for raw in block[1:]:
            l = raw.strip()
            if not l:
                continue
            is_bullet = l.startswith(_BULLET_MARKERS)
            if not summary and ("summary" in l.lower() or not is_bullet):
                summary = _SUMMARY_LABEL_RE.sub("", l.replace("**", "").strip())
            elif is_bullet or _NUMBERED_ITEM_RE.match(l):
                b = _BULLET_PREFIX_RE.sub("", l).replace("**", "").strip()
                if b:
                    bullets.append(b)

        personas.append({"name": name_line, "summary": summary, "bullets": bullets[:5]})
    return personas


def assign_review_to_persona_tfidf(df, persona_defs):
    """Assign every review to the persona whose description it resembles most.

    Uses TF-IDF vectors and their dot product (faster than an LLM call per
    review for large data).

    Args:
        df: DataFrame with a ``review_text`` column.
        persona_defs: Persona dictionaries from ``parse_personas_bulletproof``.

    Returns:
        One persona name per row of ``df``, in row order; an empty list when
        there are no rows or no personas.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer

    if df.empty or not persona_defs:
        return []

    # Missing values would make the vectorizer fail, so treat them as empty text.
    review_texts = df["review_text"].fillna("").astype(str).tolist()
    persona_texts = [p["summary"] + " " + " ".join(p["bullets"]) for p in persona_defs]

    tfidf = TfidfVectorizer(stop_words="english")
    X = tfidf.fit_transform(review_texts + persona_texts)
    n_reviews = len(review_texts)
    review_vecs = X[:n_reviews]
    persona_vecs = X[n_reviews:]

    # One (n_reviews x n_personas) product instead of a Python loop over rows.
    sims = review_vecs.dot(persona_vecs.T).toarray()
    best = np.argmax(sims, axis=1)
    return [persona_defs[idx]["name"] for idx in best]


def groq_bullets_persona(chart_desc, chart_data_text):
    """Ask the LLM for a two-bullet summary of a chart.

    Returns:
        The first two bullet lines of the answer, the whole answer prefixed
        with ``"- "`` when fewer than two bullets came back, or a fixed
        two-line placeholder when the request fails.
    """
    user_prompt = (
        f"Summarize as exactly two bullet points the main insights for this chart: {chart_desc}. "
        f"Here is the data: {chart_data_text}. "
        "Provide a percentage if applicable. Just facts."
    )
    try:
        chat_completion = groq_client.chat.completions.create(
            model=GROQ_MODEL,
            messages=[
                {"role": "system", "content": PRODUCT_CONTEXT},
                {"role": "user", "content": user_prompt},
            ],
            max_tokens=80,
            temperature=0.5,
        )
        bullets = chat_completion.choices[0].message.content.strip()
        points = [line for line in bullets.splitlines() if line.strip().startswith(("-", "•"))]
        return "\n".join(points[:2]) if len(points) >= 2 else "- " + bullets
    except Exception:
        return "- Summary not available.\n- (LLM error)"


# --- EMOTION PIPELINE (optional) ---
def emotion_pipeline(df):
    """Add a ``main_emotion`` column holding each review's top emotion label.

    The column is written onto ``df`` itself, which is also returned. Every
    review is labelled ``"neutral"`` when the model cannot be loaded, and a
    single review is labelled ``"neutral"`` when classifying it fails.
    """
    try:
        from transformers import pipeline

        emo = pipeline(
            "text-classification",
            model="finiteautomata/bertweet-base-emotion-analysis",  # much smaller than roberta-base!
            top_k=None,
            device=-1,  # always use CPU, avoid meta-tensor bug
        )
    except Exception as e:
        st.warning(f"Could not load emotion model, skipping emotion analysis: {e}")
        df["main_emotion"] = "neutral"
        return df

    all_emotions = []
    for t in df["review_text"]:
        try:
            emotions = emo(t[:512])
            if isinstance(emotions, list) and len(emotions) and isinstance(emotions[0], list):
                # Sometimes returns list of lists
                emotions = emotions[0]
            main_emo = max(emotions, key=lambda x: x["score"])["label"]
        except Exception:
            main_emo = "neutral"
        all_emotions.append(main_emo)
    df["main_emotion"] = all_emotions
    return df


def _select_usable_reviews(df):
    """Return the rows whose ``review_text`` is a real, non-placeholder string."""
    if df.empty:
        return df
    usable = df["review_text"].apply(
        lambda t: isinstance(t, str)
        and bool(t.strip())
        and "unreadable" not in t
        and "missing" not in t
    )
    return df[usable].reset_index(drop=True)


def _section_title(title):
    """Render a chart/section heading."""
    st.markdown(f"<h4>{html.escape(title)}</h4>", unsafe_allow_html=True)


def _persona_card_html(persona, color):
    """Build the HTML card (name, summary, characteristics) for one persona."""
    items = "".join(f"<li>{html.escape(b)}</li>" for b in persona["bullets"])
    return (
        f"<div style='border:2px solid {color}; border-radius:12px; padding:1em; margin-bottom:1em;'>"
        f"<h3 style='color:{color};'>{html.escape(persona['name'])}</h3>"
        f"<div><b>Summary:</b> {html.escape(persona['summary'])}</div>"
        "<div><b>Characteristics</b></div>"
        f"<ul>{items}</ul>"
        "</div>"
    )


def _highlight_card_html(persona, color, pos_summary, neg_summary, rec_out):
    """Build the HTML card with one persona's sentiment highlights and recommendation."""
    return (
        f"<div style='border:2px solid {color}; border-radius:12px; padding:1em; margin-bottom:1em;'>"
        f"<h4 style='color:{color};'>{html.escape(persona['name'])}</h4>"
        f"<div>{html.escape(persona['summary'])}</div>"
        "<div>"
        f"<b>Top Positive Sentiments:</b><br>{block_markdown(pos_summary, neon_green)}"
        "</div>"
        "<div>"
        f"<b>Top Negative Sentiments:</b><br>{block_markdown(neg_summary, neon_pink)}"
        "</div>"
        "<div>"
        f"<b>Recommendation:</b><br>{block_markdown(rec_out, neon_yellow)}"
        "</div>"
        "</div>"
    )


# ========== MAIN PIPELINE ========== #
with st.spinner("🔎 Analyzing your data... Please wait a few moments."):
    # Filter the frame itself (not just a list of texts) so that persona
    # assignment and every statistic below use exactly the same rows.
    df = _select_usable_reviews(load_reviews(CSV_PATH))
    reviews = df["review_text"].tolist() if not df.empty else []
    if reviews:
        personas_raw = generate_personas(reviews, 4)
        personas = parse_personas_bulletproof(personas_raw, 4)
        if personas:
            st.session_state['personas'] = personas
            try:
                with open(PERSONA_PATH, "w", encoding="utf-8") as f:
                    json.dump(personas, f, ensure_ascii=False, indent=2)
            except OSError as err:
                # Saving is a convenience; do not abort the page over it.
                st.warning(f"Could not save personas to {PERSONA_PATH}: {err}")
            else:
                st.success(f"{len(personas)} personas saved for next use.")
        else:
            st.warning("Could not derive personas from the model response.")
    else:
        personas = []

persona_colors = [neon_green, neon_blue, neon_pink, neon_orange]
persona_cycler = cycle(persona_colors)
persona_blocks = []
persona_names = []
# Card colour of each persona, reused by the charts so colours stay consistent
# whatever the number of personas is.
persona_color_by_name = {}

# Persona grid (left-right)
if personas:
    st.markdown("<br>", unsafe_allow_html=True)
    grid_cols = st.columns(2)
    for i, p in enumerate(personas):
        c = next(persona_cycler)
        persona_color_by_name[p["name"]] = c
        with grid_cols[i % 2]:
            st.markdown(_persona_card_html(p, c), unsafe_allow_html=True)
        persona_names.append(p["name"])
    st.markdown("<br>", unsafe_allow_html=True)

if personas and reviews:
    # Assign reviews to persona via TF-IDF (fast)
    df_reviews = df.copy()
    df_reviews["persona"] = assign_review_to_persona_tfidf(df, personas)

    # --- Generate all summary stats for new graphs
    # 1. Persona Review Share
    persona_counts = df_reviews["persona"].value_counts()
    # 2. Persona Sentiment
    avg_sentiment = df_reviews.groupby("persona")["polarity"].mean()
    # 3. Persona Review Length
    avg_length = df_reviews.groupby("persona")["review_length"].mean()
    # 4. Persona Emotion (optional)
    if "main_emotion" not in df_reviews.columns:
        df_reviews = emotion_pipeline(df_reviews)
    emo_dist = df_reviews.groupby("persona")["main_emotion"].value_counts().unstack().fillna(0)

    def _bar_colors(index):
        """Return the card colour of every persona in ``index``."""
        return [persona_color_by_name.get(name, neon_blue) for name in index]

    # --- Row 1: Pie and Sentiment Bar
    c1, c2 = st.columns(2)
    with c1:
        _section_title("Sales/Review Share by Persona")
        fig = go.Figure(
            data=[go.Pie(labels=persona_counts.index, values=persona_counts.values, hole=0.45)]
        )
        fig.update_traces(textinfo='percent+label')
        st.plotly_chart(fig, use_container_width=True)
        st.markdown(
            block_markdown(
                groq_bullets_persona("Sales/Review Share by Persona", persona_counts.to_dict()),
                neon_green,
            ),
            unsafe_allow_html=True,
        )
    with c2:
        _section_title("Average Sentiment by Persona")
        fig2 = go.Figure(
            data=[
                go.Bar(
                    x=avg_sentiment.index,
                    y=avg_sentiment.values,
                    marker=dict(color=_bar_colors(avg_sentiment.index)),
                )
            ]
        )
        fig2.update_layout(xaxis_title="Persona", yaxis_title="Avg Sentiment", font=dict(size=15))
        st.plotly_chart(fig2, use_container_width=True)
        st.markdown(
            block_markdown(
                groq_bullets_persona("Average Sentiment by Persona", avg_sentiment.to_dict()),
                neon_blue,
            ),
            unsafe_allow_html=True,
        )

    # --- Row 2: Review Length and Emotion Distribution
    c3, c4 = st.columns(2)
    with c3:
        _section_title("Persona vs. Review Length Distribution")
        fig3 = go.Figure(
            data=[
                go.Bar(
                    x=avg_length.index,
                    y=avg_length.values,
                    marker=dict(color=_bar_colors(avg_length.index)),
                )
            ]
        )
        fig3.update_layout(xaxis_title="Persona", yaxis_title="Avg Review Length", font=dict(size=15))
        st.plotly_chart(fig3, use_container_width=True)
        st.markdown(
            block_markdown(
                groq_bullets_persona("Average review length (words) by persona", avg_length.to_dict()),
                neon_orange,
            ),
            unsafe_allow_html=True,
        )
    with c4:
        _section_title("Persona vs. Emotion Distribution")
        fig4 = go.Figure()
        for em in emo_dist.columns:
            fig4.add_trace(go.Bar(name=em, x=emo_dist.index, y=emo_dist[em].values))
        fig4.update_layout(
            barmode='stack', xaxis_title="Persona", yaxis_title="Emotion Count", font=dict(size=15)
        )
        st.plotly_chart(fig4, use_container_width=True)
        st.markdown(
            block_markdown(
                groq_bullets_persona("Distribution of primary emotions per persona", emo_dist.to_dict()),
                neon_pink,
            ),
            unsafe_allow_html=True,
        )

    # --- Persona-wise Highlights, grouped by persona with headings ---
    _section_title("Persona-wise Sentiment Highlights & Recommendations")
    persona_grid = st.columns(2)
    for idx, p in enumerate(personas):
        persona_df = df_reviews[df_reviews["persona"] == p["name"]]
        top_pos = persona_df[persona_df["polarity"] > 0]["review_text"].head(2).tolist()
        top_neg = persona_df[persona_df["polarity"] < 0]["review_text"].head(2).tolist()
        pos_summary = groq_bullets_persona(
            f"Summarize two main positive sentiment points, with percentage, for persona '{p['name']}'.",
            " ".join(top_pos)
        ) if top_pos else "No positive reviews."
        neg_summary = groq_bullets_persona(
            f"Summarize two main negative sentiment points, with percentage, for persona '{p['name']}'.",
            " ".join(top_neg)
        ) if top_neg else "No negative reviews."
        rec_prompt = (
            f"You are a product marketing strategist. "
            f"Based on the review highlights and persona details for '{p['name']}' "
            f"(do not repeat the characteristics), write one concise or mention name of user, actionable product or marketing recommendation. Dont put * anywhere "
            f"for the company to better engage this persona. "
            f"Focus on practical actions the business can take (such as messaging, offers, features, or campaigns). "
            f"Reply with 1-2 sentences, avoid restating the persona’s traits."
            # The prompt refers to "review highlights and persona details",
            # so actually hand them to the model instead of only the name.
            f"\n\nPersona summary: {p['summary']}"
            f"\nPositive highlights:\n{pos_summary}"
            f"\nNegative highlights:\n{neg_summary}"
        )
        try:
            rec_out = groq_client.chat.completions.create(
                model=GROQ_MODEL,
                messages=[
                    {"role": "system", "content": PRODUCT_CONTEXT},
                    {"role": "user", "content": rec_prompt},
                ],
                max_tokens=80,
                temperature=0.5,
            ).choices[0].message.content.strip()
        except Exception:
            # Best effort: one failed request should not take the page down.
            rec_out = "No recommendation available."
        with persona_grid[idx % 2]:
            st.markdown(
                _highlight_card_html(
                    p,
                    persona_color_by_name.get(p["name"], neon_blue),
                    pos_summary,
                    neg_summary,
                    rec_out,
                ),
                unsafe_allow_html=True,
            )

st.markdown("---")