Spaces:
Sleeping
Sleeping
updated
Browse files
app.py
CHANGED
@@ -48,6 +48,11 @@ row_lookup = {row.id: i for i, row in df.iterrows()}
|
|
48 |
# 2) Load embeddings + ids once per session (S3) β new, cached
|
49 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
50 |
@st.cache_resource
|
|
|
|
|
|
|
|
|
|
|
51 |
def load_embeddings():
|
52 |
# credentials already in env (HF secrets) β boto3 will pick them up
|
53 |
BUCKET = "cgd-embeddings-bucket"
|
@@ -89,31 +94,36 @@ st.sidebar.subheader("π§ Semantic Search")
|
|
89 |
sem_query = st.sidebar.text_input("Enter a natural-language query")
|
90 |
if st.sidebar.button("Search", disabled=not sem_query.strip()):
|
91 |
with st.spinner("Embedding & searchingβ¦"):
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
st.
|
|
|
|
|
|
|
|
|
|
|
117 |
|
118 |
# ββ apply original filters ββββββββββββββββββββββββββββββββββββββββββββββ
|
119 |
filtered = df[
|
|
|
48 |
# 2) Load embeddings + ids once per session (S3) β new, cached
|
49 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
50 |
@st.cache_resource
|
51 |
+
def get_st_model():
|
52 |
+
return SentenceTransformer(
|
53 |
+
"sentence-transformers/all-MiniLM-L6-v2",
|
54 |
+
device="cpu",
|
55 |
+
)
|
56 |
def load_embeddings():
|
57 |
# credentials already in env (HF secrets) β boto3 will pick them up
|
58 |
BUCKET = "cgd-embeddings-bucket"
|
|
|
94 |
# ── semantic search UI ──────────────────────────────────────────────────
sem_query = st.sidebar.text_input("Enter a natural-language query")
if st.sidebar.button("Search", disabled=not sem_query.strip()):
    with st.spinner("Embedding & searching…"):
        # 1) embed the query on CPU with the cached model
        model = get_st_model()  # cached CPU model
        q_vec = model.encode(
            sem_query.strip(),
            convert_to_tensor=True,
            device="cpu",
        ).cpu()

        # 2) cosine similarity against the precomputed corpus embeddings.
        # BUG FIX: torch.topk raises if k exceeds the number of rows, so
        # clamp k for small corpora instead of hard-coding 50.
        sims = util.cos_sim(q_vec, emb_tensor)[0]
        k = min(50, sims.shape[0])
        top_vals, top_idx = torch.topk(sims, k=k)

        sem_ids = [ids_list[i] for i in top_idx.tolist()]
        sem_rows = df.loc[df["id"].isin(sem_ids)].copy()
        score_map = dict(zip(sem_ids, top_vals.tolist()))
        sem_rows["Score"] = sem_rows["id"].map(score_map)
        sem_rows = sem_rows.sort_values("Score", ascending=False)

        # 3) keyword / dropdown remainder.
        # BUG FIX: `filtered` is only assigned further down the script
        # ("# ── apply original filters" section), so referencing it here
        # raised NameError on the first Search click. Fall back to the full
        # df when the filtered frame does not exist yet.
        base = filtered if "filtered" in globals() else df
        remainder = base.loc[~base["id"].isin(sem_ids)].copy()
        remainder["Score"] = ""  # blank for keyword-only rows

        combined = pd.concat([sem_rows, remainder], ignore_index=True)

        st.subheader(f"🔍 Combined Results ({len(combined)})")
        st.dataframe(
            combined[["Score", "country", "year", "question_text", "answer_text"]],
            use_container_width=True,
        )
        st.stop()  # skip the old display logic below when semantic search ran
|
127 |
|
128 |
# ββ apply original filters ββββββββββββββββββββββββββββββββββββββββββββββ
|
129 |
filtered = df[
|