gigiliu12 committed on
Commit
6969959
·
verified ·
1 Parent(s): 6758cdb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +107 -109
app.py CHANGED
@@ -1,5 +1,3 @@
1
- #!/usr/bin/env python3
2
- # app.py – CGD Survey Explorer + merged semantic search
3
  import os, io, json, gc
4
  import streamlit as st
5
  import pandas as pd
@@ -8,90 +6,119 @@ import boto3, torch
8
  from sentence_transformers import SentenceTransformer, util
9
 
10
  # ────────────────────────────────────────────────────────────────────────
11
- # 1) Database credentials (provided via HF Secrets / env vars)
12
  # ────────────────────────────────────────────────────────────────────────
13
- DB_HOST = os.getenv("DB_HOST") # set these in the Space's Secrets
14
  DB_PORT = os.getenv("DB_PORT", "5432")
15
  DB_NAME = os.getenv("DB_NAME")
16
  DB_USER = os.getenv("DB_USER")
17
  DB_PASSWORD = os.getenv("DB_PASSWORD")
18
 
 
19
  @st.cache_data(ttl=600)
20
  def get_data() -> pd.DataFrame:
21
- """Pull the full survey_info table (cached for 10 min)."""
22
- conn = psycopg2.connect(
23
- host=DB_HOST,
24
- port=DB_PORT,
25
- dbname=DB_NAME,
26
- user=DB_USER,
27
- password=DB_PASSWORD,
28
- sslmode="require",
29
- )
30
- df_ = pd.read_sql_query(
31
- """SELECT id, country, year, section,
32
- question_code, question_text,
33
- answer_code, answer_text
34
- FROM survey_info;""",
35
- conn,
36
- )
37
- conn.close()
38
- return df_
39
 
40
- df = get_data()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  row_lookup = {row.id: i for i, row in df.iterrows()}
42
 
43
  # ────────────────────────────────────────────────────────────────────────
44
- # 2) Pre-computed embeddings (ids + tensor) – download once per session
45
  # ────────────────────────────────────────────────────────────────────────
46
  @st.cache_resource
47
  def load_embeddings():
 
48
  BUCKET = "cgd-embeddings-bucket"
49
- KEY = "survey_info_embeddings.pt" # contains {'ids', 'embeddings'}
50
  buf = io.BytesIO()
51
  boto3.client("s3").download_fileobj(BUCKET, KEY, buf)
52
  buf.seek(0)
53
  ckpt = torch.load(buf, map_location="cpu")
54
  buf.close(); gc.collect()
55
 
56
- if not (isinstance(ckpt, dict) and {"ids", "embeddings"} <= ckpt.keys()):
57
  st.error("Bad checkpoint format in survey_info_embeddings.pt"); st.stop()
 
58
  return ckpt["ids"], ckpt["embeddings"]
59
 
60
  ids_list, emb_tensor = load_embeddings()
61
- @st.cache_resource
62
- def get_st_model():
63
- # force CPU so we avoid the meta-tensor copy error
64
- return SentenceTransformer(
65
- "sentence-transformers/all-MiniLM-L6-v2",
66
- device="cpu",
67
- )
68
  # ────────────────────────────────────────────────────────────────────────
69
- # 3) Streamlit UI
70
  # ────────────────────────────────────────────────────────────────────────
71
  st.title("🌍 CGD Survey Explorer (Live DB)")
72
 
73
  st.sidebar.header("πŸ”Ž Filter Questions")
74
 
 
75
  country_options = sorted(df["country"].dropna().unique())
76
  year_options = sorted(df["year"].dropna().unique())
77
 
78
- sel_countries = st.sidebar.multiselect("Select Country/Countries", country_options)
79
- sel_years = st.sidebar.multiselect("Select Year(s)", year_options)
80
  keyword = st.sidebar.text_input(
81
  "Keyword Search (Question text / Answer text / Question code)", ""
82
  )
83
  group_by_question = st.sidebar.checkbox("Group by Question Text")
84
 
85
- # --- Semantic-search input (kept in sidebar) ---------------------------
86
  st.sidebar.markdown("---")
87
  st.sidebar.subheader("🧠 Semantic Search")
88
  sem_query = st.sidebar.text_input("Enter a natural-language query")
89
- search_clicked = st.sidebar.button("Search", disabled=not sem_query.strip())
90
-
91
- # ── base_filtered: applies dropdown + keyword logic (always computed) ──
92
- base_filtered = df[
93
- (df["country"].isin(sel_countries) if sel_countries else True) &
94
- (df["year"].isin(sel_years) if sel_years else True) &
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  (
96
  df["question_text"].str.contains(keyword, case=False, na=False) |
97
  df["answer_text"].str.contains(keyword, case=False, na=False) |
@@ -99,72 +126,43 @@ base_filtered = df[
99
  )
100
  ]
101
 
102
- # ────────────────────────────────────────────────────────────────────────
103
- # 4) When the Search button is clicked β†’ build merged table
104
- # ────────────────────────────────────────────────────────────────────────
105
- if search_clicked:
106
- with st.spinner("Embedding & searching…"):
107
- model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
108
- q_vec = model.encode(sem_query.strip(), convert_to_tensor=True).cpu()
109
-
110
- model = get_st_model() # ← cached instance
111
- q_vec = model.encode(
112
- sem_query.strip(),
113
- convert_to_tensor=True,
114
- device="cpu"
115
- ).cpu()
116
-
117
- sims = util.cos_sim(q_vec, emb_tensor)[0]
118
- top_vals, top_idx = torch.topk(sims, k=50) # get 50 candidates
119
-
120
- sem_ids = [ids_list[i] for i in top_idx.tolist()]
121
- sem_rows = df.loc[df["id"].isin(sem_ids)].copy()
122
- score_map = dict(zip(sem_ids, top_vals.tolist()))
123
- sem_rows["Score"] = sem_rows["id"].map(score_map)
124
- sem_rows = sem_rows.sort_values("Score", ascending=False)
125
-
126
- remainder = base_filtered.loc[~base_filtered["id"].isin(sem_ids)].copy()
127
- remainder["Score"] = "" # blank score for keyword-only rows
128
-
129
- combined = pd.concat([sem_rows, remainder], ignore_index=True)
130
-
131
- st.subheader(f"πŸ” Combined Results ({len(combined)})")
132
- st.dataframe(
133
- combined[["Score", "country", "year", "question_text", "answer_text"]],
134
- use_container_width=True,
135
  )
136
 
137
- # ────────────────────────────────────────────────────────────────────────
138
- # 5) No semantic query β†’ use original keyword filter logic / grouping
139
- # ────────────────────────────────────────────────────────────────────────
 
 
140
  else:
141
- if group_by_question:
142
- st.subheader("πŸ“Š Grouped by Question Text")
143
- grouped = (
144
- base_filtered.groupby("question_text")
145
- .agg({
146
- "country": lambda x: sorted(set(x)),
147
- "year": lambda x: sorted(set(x)),
148
- "answer_text": lambda x: list(x)[:3]
149
- })
150
- .reset_index()
151
- .rename(columns={
152
- "country": "Countries",
153
- "year": "Years",
154
- "answer_text": "Sample Answers"
155
- })
156
- )
157
- st.dataframe(grouped, use_container_width=True)
158
- if grouped.empty:
159
- st.info("No questions found with current filters.")
160
- else:
161
- heading = []
162
- if sel_countries: heading.append("Countries: " + ", ".join(sel_countries))
163
- if sel_years: heading.append("Years: " + ", ".join(map(str, sel_years)))
164
- st.markdown("### Results for " + (" | ".join(heading) if heading else "All Countries and Years"))
165
- st.dataframe(
166
- base_filtered[["country", "year", "question_text", "answer_text"]],
167
- use_container_width=True,
168
- )
169
- if base_filtered.empty:
170
- st.info("No matching questions found.")
 
 
 
1
  import os, io, json, gc
2
  import streamlit as st
3
  import pandas as pd
 
6
  from sentence_transformers import SentenceTransformer, util
7
 
8
  # ────────────────────────────────────────────────────────────────────────
9
+ # 1) DB credentials (from HF secrets or env) – original
10
  # ────────────────────────────────────────────────────────────────────────
11
+ DB_HOST = os.getenv("DB_HOST")
12
  DB_PORT = os.getenv("DB_PORT", "5432")
13
  DB_NAME = os.getenv("DB_NAME")
14
  DB_USER = os.getenv("DB_USER")
15
  DB_PASSWORD = os.getenv("DB_PASSWORD")
16
 
17
+
18
  @st.cache_data(ttl=600)
19
  def get_data() -> pd.DataFrame:
20
+ try:
21
+ conn = psycopg2.connect(
22
+ host=DB_HOST,
23
+ dbname=DB_NAME,
24
+ user=DB_USER,
25
+ password=DB_PASSWORD,
26
+ sslmode="require",
 
 
 
 
 
 
 
 
 
 
 
27
 
28
+ )
29
+ query = """
30
+ SELECT id, country, year, section,
31
+ question_code, question_text,
32
+ answer_code, answer_text
33
+ FROM survey_info;
34
+ """
35
+ df_ = pd.read_sql_query(query, conn)
36
+ conn.close()
37
+ return df_
38
+ except Exception as e:
39
+ st.error(f"Failed to connect to the database: {e}")
40
+ st.stop()
41
+
42
+ df = get_data() # ← original DataFrame
43
+
44
+ # Build a quick lookup row-index β†’ DataFrame row for later
45
  row_lookup = {row.id: i for i, row in df.iterrows()}
46
 
47
  # ────────────────────────────────────────────────────────────────────────
48
+ # 2) Load embeddings + ids once per session (S3) – new, cached
49
  # ────────────────────────────────────────────────────────────────────────
50
  @st.cache_resource
51
  def load_embeddings():
52
+ # credentials already in env (HF secrets) – boto3 will pick them up
53
  BUCKET = "cgd-embeddings-bucket"
54
+ KEY = "survey_info_embeddings.pt" # dict {'ids', 'embeddings'}
55
  buf = io.BytesIO()
56
  boto3.client("s3").download_fileobj(BUCKET, KEY, buf)
57
  buf.seek(0)
58
  ckpt = torch.load(buf, map_location="cpu")
59
  buf.close(); gc.collect()
60
 
61
+ if not (isinstance(ckpt, dict) and {"ids","embeddings"} <= ckpt.keys()):
62
  st.error("Bad checkpoint format in survey_info_embeddings.pt"); st.stop()
63
+
64
  return ckpt["ids"], ckpt["embeddings"]
65
 
66
  ids_list, emb_tensor = load_embeddings()
67
+
 
 
 
 
 
 
68
  # ────────────────────────────────────────────────────────────────────────
69
+ # 3) Streamlit UI – original filters + new semantic search
70
  # ────────────────────────────────────────────────────────────────────────
71
  st.title("🌍 CGD Survey Explorer (Live DB)")
72
 
73
  st.sidebar.header("πŸ”Ž Filter Questions")
74
 
75
+
76
  country_options = sorted(df["country"].dropna().unique())
77
  year_options = sorted(df["year"].dropna().unique())
78
 
79
+ selected_countries = st.sidebar.multiselect("Select Country/Countries", country_options)
80
+ selected_years = st.sidebar.multiselect("Select Year(s)", year_options)
81
  keyword = st.sidebar.text_input(
82
  "Keyword Search (Question text / Answer text / Question code)", ""
83
  )
84
  group_by_question = st.sidebar.checkbox("Group by Question Text")
85
 
86
+ # ── new semantic search panel ───────────────────────────────────────────
87
  st.sidebar.markdown("---")
88
  st.sidebar.subheader("🧠 Semantic Search")
89
  sem_query = st.sidebar.text_input("Enter a natural-language query")
90
+ if st.sidebar.button("Search", disabled=not sem_query.strip()):
91
+ with st.spinner("Embedding & searching…"):
92
+ model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
93
+ q_vec = model.encode(sem_query.strip(), convert_to_tensor=True).cpu()
94
+ scores = util.cos_sim(q_vec, emb_tensor)[0]
95
+ top_vals, top_idx = torch.topk(scores, k=10) # grab extra
96
+
97
+ results = []
98
+ for score, emb_row in zip(top_vals.tolist(), top_idx.tolist()):
99
+ db_id = ids_list[emb_row]
100
+ if db_id in row_lookup:
101
+ row = df.iloc[row_lookup[db_id]]
102
+ if row["question_text"] and row["answer_text"]:
103
+ results.append({
104
+ "Score": f"{score:.3f}",
105
+ "Country": row["country"],
106
+ "Year": row["year"],
107
+ "Question": row["question_text"],
108
+ "Answer": row["answer_text"],
109
+ })
110
+ if results:
111
+ st.subheader(f"πŸ” Semantic Results ({len(results)} found)")
112
+ st.dataframe(pd.DataFrame(results).head(5))
113
+ else:
114
+ st.info("No semantic matches found.")
115
+
116
+ st.markdown("---")
117
+
118
+ # ── apply original filters ──────────────────────────────────────────────
119
+ filtered = df[
120
+ (df["country"].isin(selected_countries) if selected_countries else True) &
121
+ (df["year"].isin(selected_years) if selected_years else True) &
122
  (
123
  df["question_text"].str.contains(keyword, case=False, na=False) |
124
  df["answer_text"].str.contains(keyword, case=False, na=False) |
 
126
  )
127
  ]
128
 
129
+ # ── original output logic ───────────────────────
130
+ if group_by_question:
131
+ st.subheader("πŸ“Š Grouped by Question Text")
132
+
133
+ grouped = (
134
+ filtered.groupby("question_text")
135
+ .agg({
136
+ "country": lambda x: sorted(set(x)),
137
+ "year": lambda x: sorted(set(x)),
138
+ "answer_text": lambda x: list(x)[:3]
139
+ })
140
+ .reset_index()
141
+ .rename(columns={
142
+ "country": "Countries",
143
+ "year": "Years",
144
+ "answer_text": "Sample Answers"
145
+ })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
  )
147
 
148
+ st.dataframe(grouped)
149
+
150
+ if grouped.empty:
151
+ st.info("No questions found with current filters.")
152
+
153
  else:
154
+
155
+ heading_parts = []
156
+ if selected_countries:
157
+ heading_parts.append("Countries: " + ", ".join(selected_countries))
158
+ if selected_years:
159
+ heading_parts.append("Years: " + ", ".join(map(str, selected_years)))
160
+ st.markdown("### Results for " + (" | ".join(heading_parts) if heading_parts else "All Countries and Years"))
161
+
162
+
163
+
164
+
165
+ st.dataframe(filtered[["country", "year", "question_text", "answer_text"]])
166
+
167
+ if filtered.empty:
168
+ st.info("No matching questions found.")