# cgd-ui / app.py
import os, io, gc
import streamlit as st
import pandas as pd
import psycopg2
import boto3, torch
from sentence_transformers import SentenceTransformer, util
# ────────────────────────────────────────────────────────────────────────
# 1) DB credentials (from HF secrets or env) – original
# ────────────────────────────────────────────────────────────────────────
DB_HOST = os.getenv("DB_HOST")
DB_PORT = os.getenv("DB_PORT", "5432")
DB_NAME = os.getenv("DB_NAME")
DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")
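# DB_HOST, DB_NAME, DB_USER and DB_PASSWORD must all be provided (HF Space
# secrets or plain env vars); only DB_PORT has a default. A missing value
# surfaces as the connection error caught in get_data() below.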
@st.cache_data(ttl=600)
def get_data() -> pd.DataFrame:
    try:
        conn = psycopg2.connect(
            host=DB_HOST,
            port=DB_PORT,
            dbname=DB_NAME,
            user=DB_USER,
            password=DB_PASSWORD,
            sslmode="require",
        )
        query = """
            SELECT id, country, year, section,
                   question_code, question_text,
                   answer_code, answer_text
            FROM survey_info;
        """
        df_ = pd.read_sql_query(query, conn)
        conn.close()
        return df_
    except Exception as e:
        st.error(f"Failed to connect to the database: {e}")
        st.stop()
df = get_data()  # ← original DataFrame
# Quick lookup: DB id → positional row index, for df.iloc below.
# Enumerating the id column keeps the mapping positional even if the
# DataFrame index is ever not a default RangeIndex.
row_lookup = {db_id: i for i, db_id in enumerate(df["id"])}
# ────────────────────────────────────────────────────────────────────────
# 2) Load embeddings + ids once per session (S3) – new, cached
# ────────────────────────────────────────────────────────────────────────
@st.cache_resource
def load_embeddings():
    # AWS credentials already in env (HF secrets) – boto3 will pick them up
    BUCKET = "cgd-embeddings-bucket"
    KEY = "survey_info_embeddings.pt"  # dict {'ids', 'embeddings'}
    buf = io.BytesIO()
    boto3.client("s3").download_fileobj(BUCKET, KEY, buf)
    buf.seek(0)
    ckpt = torch.load(buf, map_location="cpu")
    buf.close()
    gc.collect()
    if not (isinstance(ckpt, dict) and {"ids", "embeddings"} <= ckpt.keys()):
        st.error("Bad checkpoint format in survey_info_embeddings.pt")
        st.stop()
    return ckpt["ids"], ckpt["embeddings"]
ids_list, emb_tensor = load_embeddings()
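
# Cache the query encoder once per session; reloading SentenceTransformer on
# every search click is the slow path. The checkpoint name is assumed to match
# the model used to build the stored embeddings above.
@st.cache_resource
def load_model():
    return SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")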
# ────────────────────────────────────────────────────────────────────────
# 3) Streamlit UI – original filters + new semantic search
# ────────────────────────────────────────────────────────────────────────
st.title("🌍 CGD Survey Explorer (Live DB)")
st.sidebar.header("🔎 Filter Questions")
country_options = sorted(df["country"].dropna().unique())
year_options = sorted(df["year"].dropna().unique())
selected_countries = st.sidebar.multiselect("Select Country/Countries", country_options)
selected_years = st.sidebar.multiselect("Select Year(s)", year_options)
keyword = st.sidebar.text_input(
"Keyword Search (Question text / Answer text / Question code)", ""
)
group_by_question = st.sidebar.checkbox("Group by Question Text")
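# When checked, the main output below switches from a flat row listing to one
# row per question, with countries, years and sample answers aggregated.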
# ── new semantic search panel ───────────────────────────────────────────
st.sidebar.markdown("---")
st.sidebar.subheader("🧠 Semantic Search")
sem_query = st.sidebar.text_input("Enter a natural-language query")
if st.sidebar.button("Search", disabled=not sem_query.strip()):
    with st.spinner("Embedding & searching…"):
        model = load_model()  # cached; see load_model() above
        q_vec = model.encode(sem_query.strip(), convert_to_tensor=True).cpu()
        scores = util.cos_sim(q_vec, emb_tensor)[0]
        # grab extra hits so rows dropped below still leave enough to display
        top_vals, top_idx = torch.topk(scores, k=min(10, scores.shape[0]))
        results = []
        for score, emb_row in zip(top_vals.tolist(), top_idx.tolist()):
            db_id = ids_list[emb_row]
            if db_id in row_lookup:
                row = df.iloc[row_lookup[db_id]]
                # NaN is truthy in Python, so test for missing text explicitly
                if pd.notna(row["question_text"]) and pd.notna(row["answer_text"]):
                    results.append({
                        "Score": f"{score:.3f}",
                        "Country": row["country"],
                        "Year": row["year"],
                        "Question": row["question_text"],
                        "Answer": row["answer_text"],
                    })
    if results:
        st.subheader(f"🔍 Semantic Results ({len(results)} found)")
        st.dataframe(pd.DataFrame(results).head(5))
    else:
        st.info("No semantic matches found.")
st.markdown("---")
# ── apply original filters ──────────────────────────────────────────────
# Apply the sidebar filters; an empty selection or keyword leaves that
# dimension unfiltered instead of matching nothing.
mask = pd.Series(True, index=df.index)
if selected_countries:
    mask &= df["country"].isin(selected_countries)
if selected_years:
    mask &= df["year"].isin(selected_years)
if keyword:
    mask &= (
        df["question_text"].str.contains(keyword, case=False, na=False)
        | df["answer_text"].str.contains(keyword, case=False, na=False)
        | df["question_code"].astype(str).str.contains(keyword, case=False, na=False)
    )
filtered = df[mask]
# ── original output logic ───────────────────────
if group_by_question:
    st.subheader("📊 Grouped by Question Text")
    grouped = (
        filtered.groupby("question_text")
        .agg({
            "country": lambda x: sorted(set(x)),
            "year": lambda x: sorted(set(x)),
            "answer_text": lambda x: list(x)[:3],
        })
        .reset_index()
        .rename(columns={
            "country": "Countries",
            "year": "Years",
            "answer_text": "Sample Answers",
        })
    )
    st.dataframe(grouped)
    if grouped.empty:
        st.info("No questions found with current filters.")
else:
    heading_parts = []
    if selected_countries:
        heading_parts.append("Countries: " + ", ".join(selected_countries))
    if selected_years:
        heading_parts.append("Years: " + ", ".join(map(str, selected_years)))
    st.markdown("### Results for " + (" | ".join(heading_parts) if heading_parts else "All Countries and Years"))
    st.dataframe(filtered[["country", "year", "question_text", "answer_text"]])
    if filtered.empty:
        st.info("No matching questions found.")