Spaces:

vanderbilt-dsi
/

cgd-ui-panel

Running

App Files Files Community

myshirk committed 8 days ago

Commit

0acec27

verified ·

1 Parent(s): 8041be5

add group by function. allow filters to adjust automatically

Browse files

Files changed (1) hide show

app.py +35 -18

app.py CHANGED Viewed

@@ -87,13 +87,28 @@ result_table = pn.widgets.Tabulator(
 )
 # ──────────────────────────────────────────────────────────────────────
-# 4) Semantic Search with Filtering
 # ──────────────────────────────────────────────────────────────────────
-def semantic_search(event=None):
-    """Run filtered view if no semantic query; otherwise do semantic within filtered subset."""
-    query = w_semquery.value.strip()
-    # 1) Apply filters first (country/year/keyword)
     filt = df.copy()
     if w_countries.value:
         filt = filt[filt["country"].isin(w_countries.value)]
@@ -106,30 +121,23 @@ def semantic_search(event=None):
             filt["question_code"].astype(str).str.contains(w_keyword.value, case=False, na=False)
         ]
-    # 2) If no semantic query, just show the filtered data (no Score column)
     if not query:
-        if filt.empty:
-            result_table.value = pd.DataFrame(columns=["country", "year", "question_text", "answer_text"])
-        else:
-            result_table.value = filt[["country", "year", "question_text", "answer_text"]]
         return
-    # 3) Otherwise, do semantic search *within* the filtered subset
     model, ids_list, emb_tensor = get_semantic_resources()
     filtered_ids = filt["id"].tolist()
     id_to_index = {id_: i for i, id_ in enumerate(ids_list)}
     filtered_indices = [id_to_index[id_] for id_ in filtered_ids if id_ in id_to_index]
     if not filtered_indices:
-        result_table.value = pd.DataFrame(columns=["Score", "country", "year", "question_text", "answer_text"])
         return
     filtered_embs = emb_tensor[filtered_indices]
     q_vec = model.encode(query, convert_to_tensor=True, device="cpu").cpu()
     sims = util.cos_sim(q_vec, filtered_embs)[0]
-    top_vals, top_idx = torch.topk(sims, k=50)
     top_filtered_ids = [filtered_ids[i] for i in top_idx.tolist()]
     sem_rows = filt[filt["id"].isin(top_filtered_ids)].copy()
@@ -137,7 +145,7 @@ def semantic_search(event=None):
     sem_rows["Score"] = sem_rows["id"].map(score_map)
     sem_rows = sem_rows.sort_values("Score", ascending=False)
-    result_table.value = sem_rows[["Score", "country", "year", "question_text", "answer_text"]]
 def clear_filters(event=None):
@@ -147,9 +155,18 @@ def clear_filters(event=None):
     w_semquery.value = ""
     result_table.value = df[["country", "year", "question_text", "answer_text"]].copy()
-w_search_button.on_click(semantic_search)
 w_clear_filters.on_click(clear_filters)
 # ──────────────────────────────────────────────────────────────────────
 # 5) Layout
 # ──────────────────────────────────────────────────────────────────────

 )
 # ──────────────────────────────────────────────────────────────────────
+# 4) Search Logic
 # ──────────────────────────────────────────────────────────────────────
+def _group_by_question(df_in: pd.DataFrame) -> pd.DataFrame:  # collapse df_in to one row per question_text with aggregated countries, years and sample answers
+    if df_in.empty:  # nothing to group: return an empty frame that already has the grouped column names
+        return pd.DataFrame(columns=["question_text", "Countries", "Years", "Sample Answers"])
+    tmp = df_in.copy()  # work on a copy so the year clean-up below does not modify the caller's frame
+    tmp["year"] = tmp["year"].replace('', pd.NA)  # turn empty-string years into NA so the notna() filter below drops them
+    grouped = (
+        tmp.groupby("question_text", dropna=False)  # dropna=False keeps rows whose question_text is missing as their own group
+        .agg({
+            "country": lambda x: sorted({v for v in x if pd.notna(v)}),  # sorted list of distinct non-missing countries
+            "year":    lambda x: sorted({str(v) for v in x if pd.notna(v)}),  # distinct non-missing years as strings, so the sort is lexicographic
+            "answer_text": lambda x: list(x.dropna())[:3],  # first three non-missing answers, in row order
+        })
+        .reset_index()  # move question_text from the index back to a regular column
+        .rename(columns={"country": "Countries", "year": "Years", "answer_text": "Sample Answers"})  # display names; same columns as the empty-input frame above
+    )
+    return grouped
+def search(event=None):
+    query = w_semquery.value.strip()
     filt = df.copy()
     if w_countries.value:
         filt = filt[filt["country"].isin(w_countries.value)]
             filt["question_code"].astype(str).str.contains(w_keyword.value, case=False, na=False)
         ]
     if not query:
+        result_table.value = _group_by_question(filt) if w_group.value else filt[["country", "year", "question_text", "answer_text"]]
         return
     model, ids_list, emb_tensor = get_semantic_resources()
     filtered_ids = filt["id"].tolist()
     id_to_index = {id_: i for i, id_ in enumerate(ids_list)}
     filtered_indices = [id_to_index[id_] for id_ in filtered_ids if id_ in id_to_index]
     if not filtered_indices:
+        result_table.value = _group_by_question(filt.iloc[0:0]) if w_group.value else pd.DataFrame(columns=["Score", "country", "year", "question_text", "answer_text"])
         return
     filtered_embs = emb_tensor[filtered_indices]
     q_vec = model.encode(query, convert_to_tensor=True, device="cpu").cpu()
     sims = util.cos_sim(q_vec, filtered_embs)[0]
+    top_k = min(50, len(filtered_indices))
+    top_vals, top_idx = torch.topk(sims, k=top_k)
     top_filtered_ids = [filtered_ids[i] for i in top_idx.tolist()]
     sem_rows = filt[filt["id"].isin(top_filtered_ids)].copy()
     sem_rows["Score"] = sem_rows["id"].map(score_map)
     sem_rows = sem_rows.sort_values("Score", ascending=False)
+    result_table.value = _group_by_question(sem_rows.drop(columns=["Score"])) if w_group.value else sem_rows[["Score", "country", "year", "question_text", "answer_text"]]
 def clear_filters(event=None):
     w_semquery.value = ""
     result_table.value = df[["country", "year", "question_text", "answer_text"]].copy()
+w_search_button.on_click(search)
 w_clear_filters.on_click(clear_filters)
+# Live updates for filters (except semantic query and keyword)
+w_group.param.watch(lambda e: search(), 'value')
+w_countries.param.watch(lambda e: search(), 'value')
+w_years.param.watch(lambda e: search(), 'value')
+# Allow pressing Enter in semantic query or keyword to trigger search
+w_semquery.param.watch(lambda e: search(), 'enter_pressed')
+w_keyword.param.watch(lambda e: search(), 'enter_pressed')
 # ──────────────────────────────────────────────────────────────────────
 # 5) Layout
 # ──────────────────────────────────────────────────────────────────────