DrishtiSharma committed on
Commit
7ff7723
·
verified ·
1 Parent(s): 7ff3268

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +96 -12
app.py CHANGED
@@ -191,15 +191,99 @@ COLUMN_SYNONYMS = {
191
  }
192
 
193
 
194
- # Helper function to map user query terms to dataset columns
195
- #def map_query_to_column(query):
196
- # for col, synonyms in COLUMN_SYNONYMS.items():
197
- # for term in synonyms:
198
- # if term in query:
199
- # return col
200
- # return None
201
-
202
- def map_query_to_column(query):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
  query = query.lower()
204
  all_synonyms = {synonym: col for col, synonyms in COLUMN_SYNONYMS.items() for synonym in synonyms}
205
  matches = get_close_matches(query, all_synonyms.keys(), n=1, cutoff=0.6)
@@ -210,10 +294,10 @@ def map_query_to_column(query):
210
  for col, synonyms in COLUMN_SYNONYMS.items():
211
  if any(term in query for term in synonyms):
212
  return col
213
- return None
214
 
215
 
216
- # Visualization generator with synonym handling
217
  def generate_visual_from_query(query, df):
218
  try:
219
  query = query.lower()
@@ -253,7 +337,7 @@ def generate_visual_from_query(query, df):
253
 
254
  except Exception as e:
255
  st.error(f"Error generating visualization: {e}")
256
- return None
257
 
258
  # SQL-RAG Analysis
259
  if st.session_state.df is not None:
 
191
  }
192
 
193
 
194
+ # Fuzzy match to map query terms to dataset columns
195
def fuzzy_match_columns(query, n=2):
    """Map the words of a free-text query to dataset column names.

    Each whitespace-separated word of the lower-cased query is fuzzy-matched
    (``difflib.get_close_matches`` with a 0.6 cutoff) against every synonym
    listed in ``COLUMN_SYNONYMS``; each matching synonym contributes the
    column it belongs to.

    Args:
        query: User query text. ``None`` or an empty string yields ``[]``.
        n: Maximum number of close synonym matches considered per word.

    Returns:
        Column names in order of first appearance, without duplicates.
    """
    if not query:
        return []

    # Reverse lookup: synonym -> column it stands for.
    all_synonyms = {
        synonym: col
        for col, synonyms in COLUMN_SYNONYMS.items()
        for synonym in synonyms
    }
    candidates = list(all_synonyms)  # built once, reused for every word

    # Drop connector words as whole tokens only. A substring replace would
    # also mangle words that merely contain them ("brand" -> "br",
    # "thailand" -> "thail") and break their fuzzy match.
    connectors = {"and", "vs", "vs."}
    words = [word for word in query.lower().split() if word not in connectors]

    matched_columns = []
    for word in words:
        for match in get_close_matches(word, candidates, n=n, cutoff=0.6):
            matched_columns.append(all_synonyms[match])

    # dict.fromkeys removes duplicates while preserving insertion order.
    return list(dict.fromkeys(matched_columns))
209
+
210
+ # Visualization generator with dynamic groupby handling
211
def generate_visual_from_query(query, df):
    """Build a salary chart from a free-text query.

    Columns mentioned in the query are resolved with ``fuzzy_match_columns``:
    the first match becomes the primary axis and the second (if any) the
    colour/grouping column. The chart type is then chosen from keywords in
    the query ("distribution", "average"/"mean", "trend", "remote",
    "company size", "country"/"location"), checked in that order.

    Args:
        query: User query text; matched case-insensitively.
        df: DataFrame to plot. Every branch reads ``salary_in_usd``; specific
            branches also read ``work_year``, ``remote_ratio``,
            ``company_size`` or ``employee_residence``.

    Returns:
        The figure produced by ``px.box`` / ``px.bar`` / ``px.line``, or
        ``None`` when no branch applies (a warning is shown via
        ``st.warning``) or an exception occurs (reported via ``st.error``).
    """
    try:
        # Keyword checks below are all lowercase, so normalise the query once.
        query = (query or "").lower()

        # Step 1: Fuzzy match columns mentioned in the query
        matched_columns = fuzzy_match_columns(query)

        # Step 2: Primary axis and optional grouping column. Indexing is
        # guarded so a query with connector words ("and", "vs", "by") but no
        # recognisable column no longer raises IndexError.
        x_axis = matched_columns[0] if matched_columns else None
        group_by = matched_columns[1] if len(matched_columns) > 1 else None

        def pretty(column):
            # "experience_level" -> "Experience Level"
            return column.replace("_", " ").title()

        def group_suffix(color):
            return f" and {pretty(color)}" if color else ""

        def mean_salary_bar(axis, title, with_suffix=True, **bar_kwargs):
            # Never group by the axis column twice: a duplicated key makes
            # the groupby/reset_index step fail.
            color = group_by if group_by != axis else None
            keys = [axis] + ([color] if color else [])
            grouped_df = df.groupby(keys)["salary_in_usd"].mean().reset_index()
            if with_suffix:
                title = title + group_suffix(color)
            return px.bar(grouped_df, x=axis, y="salary_in_usd", color=color,
                          title=title, **bar_kwargs)

        # Step 3: Visualization logic
        if "distribution" in query and x_axis:
            return px.box(df, x=x_axis, y="salary_in_usd", color=group_by,
                          title=f"Salary Distribution by {pretty(x_axis)}"
                          + group_suffix(group_by))

        # Requires a matched column; otherwise fall through to the
        # fixed-axis branches instead of grouping by None.
        if ("average" in query or "mean" in query) and x_axis:
            return mean_salary_bar(x_axis, f"Average Salary by {pretty(x_axis)}",
                                   barmode="group")

        if "trend" in query and "work_year" in df.columns and x_axis:
            grouped_df = df.groupby(["work_year", x_axis])["salary_in_usd"].mean().reset_index()
            return px.line(grouped_df, x="work_year", y="salary_in_usd", color=x_axis,
                           title=f"Salary Trend over Years by {pretty(x_axis)}")

        if "remote" in query:
            return mean_salary_bar("remote_ratio", "Remote Work Impact on Salary",
                                   with_suffix=False, barmode="group")

        if "company size" in query:
            return mean_salary_bar("company_size", "Salary by Company Size")

        if "country" in query or "location" in query:
            return mean_salary_bar("employee_residence", "Salary by Employee Residence")

        st.warning("❓ No suitable visualization detected. Please refine your query.")
        return None

    except Exception as e:
        # UI boundary: surface the failure to the user instead of crashing the app.
        st.error(f"Error generating visualization: {e}")
        return None
275
+
276
+
277
+
278
+
279
+
280
+
281
+
282
+
283
+
284
+
285
+
286
+ """def map_query_to_column(query):
287
  query = query.lower()
288
  all_synonyms = {synonym: col for col, synonyms in COLUMN_SYNONYMS.items() for synonym in synonyms}
289
  matches = get_close_matches(query, all_synonyms.keys(), n=1, cutoff=0.6)
 
294
  for col, synonyms in COLUMN_SYNONYMS.items():
295
  if any(term in query for term in synonyms):
296
  return col
297
+ return None"""
298
 
299
 
300
+ """# Visualization generator with synonym handling
301
  def generate_visual_from_query(query, df):
302
  try:
303
  query = query.lower()
 
337
 
338
  except Exception as e:
339
  st.error(f"Error generating visualization: {e}")
340
+ return None"""
341
 
342
  # SQL-RAG Analysis
343
  if st.session_state.df is not None: