gigiliu12 commited on
Commit
ecd8944
·
verified ·
1 Parent(s): 6969959
Files changed (1) hide show
  1. app.py +35 -25
app.py CHANGED
@@ -48,6 +48,11 @@ row_lookup = {row.id: i for i, row in df.iterrows()}
48
  # 2) Load embeddings + ids once per session (S3) – new, cached
49
  # ────────────────────────────────────────────────────────────────────────
50
  @st.cache_resource
 
 
 
 
 
51
  def load_embeddings():
52
  # credentials already in env (HF secrets) – boto3 will pick them up
53
  BUCKET = "cgd-embeddings-bucket"
@@ -89,31 +94,36 @@ st.sidebar.subheader("🧠 Semantic Search")
89
  sem_query = st.sidebar.text_input("Enter a natural-language query")
90
  if st.sidebar.button("Search", disabled=not sem_query.strip()):
91
  with st.spinner("Embedding & searching…"):
92
- model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
93
- q_vec = model.encode(sem_query.strip(), convert_to_tensor=True).cpu()
94
- scores = util.cos_sim(q_vec, emb_tensor)[0]
95
- top_vals, top_idx = torch.topk(scores, k=10) # grab extra
96
-
97
- results = []
98
- for score, emb_row in zip(top_vals.tolist(), top_idx.tolist()):
99
- db_id = ids_list[emb_row]
100
- if db_id in row_lookup:
101
- row = df.iloc[row_lookup[db_id]]
102
- if row["question_text"] and row["answer_text"]:
103
- results.append({
104
- "Score": f"{score:.3f}",
105
- "Country": row["country"],
106
- "Year": row["year"],
107
- "Question": row["question_text"],
108
- "Answer": row["answer_text"],
109
- })
110
- if results:
111
- st.subheader(f"🔍 Semantic Results ({len(results)} found)")
112
- st.dataframe(pd.DataFrame(results).head(5))
113
- else:
114
- st.info("No semantic matches found.")
115
-
116
- st.markdown("---")
 
 
 
 
 
117
 
118
  # ── apply original filters ──────────────────────────────────────────────
119
  filtered = df[
 
48
  # 2) Load embeddings + ids once per session (S3) – new, cached
49
  # ────────────────────────────────────────────────────────────────────────
50
  @st.cache_resource
51
+ def get_st_model():
52
+ return SentenceTransformer(
53
+ "sentence-transformers/all-MiniLM-L6-v2",
54
+ device="cpu",
55
+ )
56
  def load_embeddings():
57
  # credentials already in env (HF secrets) – boto3 will pick them up
58
  BUCKET = "cgd-embeddings-bucket"
 
94
  sem_query = st.sidebar.text_input("Enter a natural-language query")
95
  if st.sidebar.button("Search", disabled=not sem_query.strip()):
96
  with st.spinner("Embedding & searching…"):
97
+ # 1) embed query
98
+ model = get_st_model() # cached CPU model
99
+ q_vec = model.encode(
100
+ sem_query.strip(),
101
+ convert_to_tensor=True,
102
+ device="cpu"
103
+ ).cpu()
104
+
105
+ # 2) semantic similarity
106
+ sims = util.cos_sim(q_vec, emb_tensor)[0]
107
+ top_vals, top_idx = torch.topk(sims, k=50)
108
+
109
+ sem_ids = [ids_list[i] for i in top_idx.tolist()]
110
+ sem_rows = df.loc[df["id"].isin(sem_ids)].copy()
111
+ score_map = dict(zip(sem_ids, top_vals.tolist()))
112
+ sem_rows["Score"] = sem_rows["id"].map(score_map)
113
+ sem_rows = sem_rows.sort_values("Score", ascending=False)
114
+
115
+ # 3) keyword / dropdown remainder
116
+ remainder = filtered.loc[~filtered["id"].isin(sem_ids)].copy()
117
+ remainder["Score"] = "" # blank for keyword-only rows
118
+
119
+ combined = pd.concat([sem_rows, remainder], ignore_index=True)
120
+
121
+ st.subheader(f"🔍 Combined Results ({len(combined)})")
122
+ st.dataframe(
123
+ combined[["Score", "country", "year", "question_text", "answer_text"]],
124
+ use_container_width=True,
125
+ )
126
+ st.stop() # skip the old display logic below when semantic search ran
127
 
128
  # ── apply original filters ──────────────────────────────────────────────
129
  filtered = df[