cgd-ui-TEST

Sleeping

App Files Files Community

gigiliu12 commited on 9 days ago

Commit

c43bc47

verified ·

1 Parent(s): b40e2c2

Update logic

Browse files

Files changed (1) hide show

app.py +92 -103

app.py CHANGED Viewed

@@ -1,3 +1,5 @@
 import os, io, json, gc
 import streamlit as st
 import pandas as pd
@@ -6,119 +8,84 @@ import boto3, torch
 from sentence_transformers import SentenceTransformer, util
 # ────────────────────────────────────────────────────────────────────────
-# 1)  DB credentials (from HF secrets or env)  – original
 # ────────────────────────────────────────────────────────────────────────
-DB_HOST = os.getenv("DB_HOST")
 DB_PORT = os.getenv("DB_PORT", "5432")
 DB_NAME = os.getenv("DB_NAME")
 DB_USER = os.getenv("DB_USER")
 DB_PASSWORD = os.getenv("DB_PASSWORD")
 @st.cache_data(ttl=600)
 def get_data() -> pd.DataFrame:
-    try:
-        conn = psycopg2.connect(
-            host=DB_HOST,
-            dbname=DB_NAME,
-            user=DB_USER,
-            password=DB_PASSWORD,
-            sslmode="require",
-        )
-        query = """
-            SELECT id, country, year, section,
-                   question_code, question_text,
-                   answer_code, answer_text
-              FROM survey_info;
-        """
-        df_ = pd.read_sql_query(query, conn)
-        conn.close()
-        return df_
-    except Exception as e:
-        st.error(f"Failed to connect to the database: {e}")
-        st.stop()
-df = get_data()              # ← original DataFrame
-# Build a quick lookup row-index → DataFrame row for later
 row_lookup = {row.id: i for i, row in df.iterrows()}
 # ────────────────────────────────────────────────────────────────────────
-# 2)  Load embeddings + ids once per session  (S3) – new, cached
 # ────────────────────────────────────────────────────────────────────────
 @st.cache_resource
 def load_embeddings():
-    # credentials already in env (HF secrets) – boto3 will pick them up
     BUCKET = "cgd-embeddings-bucket"
-    KEY    = "survey_info_embeddings.pt"   # dict {'ids', 'embeddings'}
     buf = io.BytesIO()
     boto3.client("s3").download_fileobj(BUCKET, KEY, buf)
     buf.seek(0)
     ckpt = torch.load(buf, map_location="cpu")
     buf.close(); gc.collect()
-    if not (isinstance(ckpt, dict) and {"ids","embeddings"} <= ckpt.keys()):
         st.error("Bad checkpoint format in survey_info_embeddings.pt"); st.stop()
     return ckpt["ids"], ckpt["embeddings"]
 ids_list, emb_tensor = load_embeddings()
 # ────────────────────────────────────────────────────────────────────────
-# 3)  Streamlit UI – original filters + new semantic search
 # ────────────────────────────────────────────────────────────────────────
 st.title("🌍 CGD Survey Explorer (Live DB)")
 st.sidebar.header("🔎 Filter Questions")
 country_options = sorted(df["country"].dropna().unique())
 year_options    = sorted(df["year"].dropna().unique())
-selected_countries = st.sidebar.multiselect("Select Country/Countries", country_options)
-selected_years     = st.sidebar.multiselect("Select Year(s)", year_options)
 keyword = st.sidebar.text_input(
     "Keyword Search (Question text / Answer text / Question code)", ""
 )
 group_by_question = st.sidebar.checkbox("Group by Question Text")
-# ── new semantic search panel ───────────────────────────────────────────
 st.sidebar.markdown("---")
 st.sidebar.subheader("🧠 Semantic Search")
 sem_query = st.sidebar.text_input("Enter a natural-language query")
-if st.sidebar.button("Search", disabled=not sem_query.strip()):
-    with st.spinner("Embedding & searching…"):
-        model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
-        q_vec = model.encode(sem_query.strip(), convert_to_tensor=True).cpu()
-        scores = util.cos_sim(q_vec, emb_tensor)[0]
-        top_vals, top_idx = torch.topk(scores, k=10)   # grab extra
-        results = []
-        for score, emb_row in zip(top_vals.tolist(), top_idx.tolist()):
-            db_id = ids_list[emb_row]
-            if db_id in row_lookup:
-                row = df.iloc[row_lookup[db_id]]
-                if row["question_text"] and row["answer_text"]:
-                    results.append({
-                        "Score": f"{score:.3f}",
-                        "Country": row["country"],
-                        "Year": row["year"],
-                        "Question": row["question_text"],
-                        "Answer": row["answer_text"],
-                    })
-        if results:
-            st.subheader(f"🔍 Semantic Results ({len(results)} found)")
-            st.dataframe(pd.DataFrame(results).head(5))
-        else:
-            st.info("No semantic matches found.")
-st.markdown("---")
-# ── apply original filters ──────────────────────────────────────────────
-filtered = df[
-    (df["country"].isin(selected_countries) if selected_countries else True) &
-    (df["year"].isin(selected_years)        if selected_years else True)       &
     (
         df["question_text"].str.contains(keyword, case=False, na=False) |
         df["answer_text"].str.contains(keyword, case=False, na=False)   |
@@ -126,43 +93,65 @@ filtered = df[
     )
 ]
-# ── original output logic ───────────────────────
-if group_by_question:
-    st.subheader("📊 Grouped by Question Text")
-    grouped = (
-        filtered.groupby("question_text")
-        .agg({
-            "country": lambda x: sorted(set(x)),
-            "year":    lambda x: sorted(set(x)),
-            "answer_text": lambda x: list(x)[:3]
-        })
-        .reset_index()
-        .rename(columns={
-            "country": "Countries",
-            "year":    "Years",
-            "answer_text": "Sample Answers"
-        })
-    )
-    st.dataframe(grouped)
-    if grouped.empty:
-        st.info("No questions found with current filters.")
-else:
-    heading_parts = []
-    if selected_countries:
-        heading_parts.append("Countries: " + ", ".join(selected_countries))
-    if selected_years:
-        heading_parts.append("Years: " + ", ".join(map(str, selected_years)))
-    st.markdown("### Results for " + (" | ".join(heading_parts) if heading_parts else "All Countries and Years"))
-    st.dataframe(filtered[["country", "year", "question_text", "answer_text"]])
-    if filtered.empty:
-        st.info("No matching questions found.")

+#!/usr/bin/env python3
+# app.py  – CGD Survey Explorer + merged semantic search
 import os, io, json, gc
 import streamlit as st
 import pandas as pd
 from sentence_transformers import SentenceTransformer, util
 # ────────────────────────────────────────────────────────────────────────
+# 1)  Database credentials (provided via HF Secrets / env vars)
 # ────────────────────────────────────────────────────────────────────────
+DB_HOST = os.getenv("DB_HOST")            # set these in the Space’s Secrets
 DB_PORT = os.getenv("DB_PORT", "5432")
 DB_NAME = os.getenv("DB_NAME")
 DB_USER = os.getenv("DB_USER")
 DB_PASSWORD = os.getenv("DB_PASSWORD")
 @st.cache_data(ttl=600)
 def get_data() -> pd.DataFrame:
+    """Pull the full survey_info table (cached for 10 min)."""
+    conn = psycopg2.connect(
+        host=DB_HOST,
+        port=DB_PORT,
+        dbname=DB_NAME,
+        user=DB_USER,
+        password=DB_PASSWORD,
+        sslmode="require",
+    )
+    df_ = pd.read_sql_query(
+        """SELECT id, country, year, section,
+                  question_code, question_text,
+                  answer_code, answer_text
+             FROM survey_info;""",
+        conn,
+    )
+    conn.close()
+    return df_
+df = get_data()
 row_lookup = {row.id: i for i, row in df.iterrows()}
 # ────────────────────────────────────────────────────────────────────────
+# 2)  Pre-computed embeddings (ids + tensor) – download once per session
 # ────────────────────────────────────────────────────────────────────────
 @st.cache_resource
 def load_embeddings():
     BUCKET = "cgd-embeddings-bucket"
+    KEY    = "survey_info_embeddings.pt"  # contains {'ids', 'embeddings'}
     buf = io.BytesIO()
     boto3.client("s3").download_fileobj(BUCKET, KEY, buf)
     buf.seek(0)
     ckpt = torch.load(buf, map_location="cpu")
     buf.close(); gc.collect()
+    if not (isinstance(ckpt, dict) and {"ids", "embeddings"} <= ckpt.keys()):
         st.error("Bad checkpoint format in survey_info_embeddings.pt"); st.stop()
     return ckpt["ids"], ckpt["embeddings"]
 ids_list, emb_tensor = load_embeddings()
 # ────────────────────────────────────────────────────────────────────────
+# 3)  Streamlit UI
 # ────────────────────────────────────────────────────────────────────────
 st.title("🌍 CGD Survey Explorer (Live DB)")
 st.sidebar.header("🔎 Filter Questions")
 country_options = sorted(df["country"].dropna().unique())
 year_options    = sorted(df["year"].dropna().unique())
+sel_countries = st.sidebar.multiselect("Select Country/Countries", country_options)
+sel_years     = st.sidebar.multiselect("Select Year(s)", year_options)
 keyword = st.sidebar.text_input(
     "Keyword Search (Question text / Answer text / Question code)", ""
 )
 group_by_question = st.sidebar.checkbox("Group by Question Text")
+# --- Semantic-search input (kept in sidebar) ---------------------------
 st.sidebar.markdown("---")
 st.sidebar.subheader("🧠 Semantic Search")
 sem_query = st.sidebar.text_input("Enter a natural-language query")
+search_clicked = st.sidebar.button("Search", disabled=not sem_query.strip())
+# ── base_filtered: applies dropdown + keyword logic (always computed) ──
+base_filtered = df[
+    (df["country"].isin(sel_countries) if sel_countries else True) &
+    (df["year"].isin(sel_years)        if sel_years     else True) &
     (
         df["question_text"].str.contains(keyword, case=False, na=False) |
         df["answer_text"].str.contains(keyword, case=False, na=False)   |
     )
 ]
+# ────────────────────────────────────────────────────────────────────────
+# 4)  When the Search button is clicked → build merged table
+# ────────────────────────────────────────────────────────────────────────
+if search_clicked:
+    with st.spinner("Embedding & searching…"):
+        model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+        q_vec = model.encode(sem_query.strip(), convert_to_tensor=True).cpu()
+        sims = util.cos_sim(q_vec, emb_tensor)[0]
+        top_vals, top_idx = torch.topk(sims, k=50)   # get 50 candidates
+        sem_ids   = [ids_list[i] for i in top_idx.tolist()]
+        sem_rows  = df.loc[df["id"].isin(sem_ids)].copy()
+        score_map = dict(zip(sem_ids, top_vals.tolist()))
+        sem_rows["Score"] = sem_rows["id"].map(score_map)
+        sem_rows = sem_rows.sort_values("Score", ascending=False)
+        remainder = base_filtered.loc[~base_filtered["id"].isin(sem_ids)].copy()
+        remainder["Score"] = ""   # blank score for keyword-only rows
+        combined = pd.concat([sem_rows, remainder], ignore_index=True)
+    st.subheader(f"🔍 Combined Results ({len(combined)})")
+    st.dataframe(
+        combined[["Score", "country", "year", "question_text", "answer_text"]],
+        use_container_width=True,
+    )
+# ────────────────────────────────────────────────────────────────────────
+# 5)  No semantic query → use original keyword filter logic / grouping
+# ────────────────────────────────────────────────────────────────────────
+else:
+    if group_by_question:
+        st.subheader("📊 Grouped by Question Text")
+        grouped = (
+            base_filtered.groupby("question_text")
+            .agg({
+                "country": lambda x: sorted(set(x)),
+                "year":    lambda x: sorted(set(x)),
+                "answer_text": lambda x: list(x)[:3]
+            })
+            .reset_index()
+            .rename(columns={
+                "country": "Countries",
+                "year":    "Years",
+                "answer_text": "Sample Answers"
+            })
+        )
+        st.dataframe(grouped, use_container_width=True)
+        if grouped.empty:
+            st.info("No questions found with current filters.")
+    else:
+        heading = []
+        if sel_countries: heading.append("Countries: " + ", ".join(sel_countries))
+        if sel_years:     heading.append("Years: " + ", ".join(map(str, sel_years)))
+        st.markdown("### Results for " + (" | ".join(heading) if heading else "All Countries and Years"))
+        st.dataframe(
+            base_filtered[["country", "year", "question_text", "answer_text"]],
+            use_container_width=True,
+        )
+        if base_filtered.empty:
+            st.info("No matching questions found.")