dolphinium committed
Commit f74c067 · 1 Parent(s): a4b0896

feat: Integrate dynamic field suggestions from external API into analysis plan generation and UI

Files changed (4)
  1. data_processing.py +23 -6
  2. extract_results.py +28 -0
  3. llm_prompts.py +25 -3
  4. ui.py +32 -17
data_processing.py CHANGED
@@ -26,6 +26,8 @@ from llm_prompts import (
     get_synthesis_report_prompt,
     get_visualization_code_prompt
 )
+from extract_results import get_search_list_params
+
 
 def parse_suggestions_from_report(report_text):
     """Extracts numbered suggestions from the report's markdown text."""
@@ -35,20 +37,36 @@ def parse_suggestions_from_report(report_text):
     suggestions = re.findall(r"^\s*\d+\.\s*(.*)", suggestions_text, re.MULTILINE)
     return [s.strip() for s in suggestions]
 
+
 def llm_generate_analysis_plan_with_history(llm_model, natural_language_query, chat_history):
     """
-    Generates a complete analysis plan from a user query, considering chat history.
+    Generates a complete analysis plan from a user query, considering chat history
+    and dynamic field suggestions from an external API.
     """
-    prompt = get_analysis_plan_prompt(natural_language_query, chat_history)
+    search_fields, search_name = [], ""
+    try:
+        # Call the external API to get dynamic field suggestions
+        search_fields, search_name = get_search_list_params(natural_language_query)
+        print(f"Successfully retrieved {len(search_fields)} dynamic fields.")
+    except Exception as e:
+        print(f"Warning: Could not retrieve dynamic search fields. Proceeding without them. Error: {e}")
+
+    # Generate the prompt, including the (potentially empty) search_fields
+    prompt = get_analysis_plan_prompt(natural_language_query, chat_history, search_fields)
+
     try:
         response = llm_model.generate_content(prompt)
         cleaned_text = re.sub(r'```json\s*|\s*```', '', response.text, flags=re.MULTILINE | re.DOTALL).strip()
         plan = json.loads(cleaned_text)
-        return plan
+        # Return the plan and the retrieved fields for UI display
+        return plan, search_fields
     except Exception as e:
        raw_response_text = response.text if 'response' in locals() else 'N/A'
        print(f"Error in llm_generate_analysis_plan_with_history: {e}\nRaw Response:\n{raw_response_text}")
-        return None
+        # Return None for the plan but still return search_fields for debugging in the UI
+        return None, search_fields
+
+
 
 def execute_quantitative_query(solr_client, plan):
     """Executes the facet query to get aggregate data."""
@@ -126,5 +144,4 @@ def execute_viz_code_and_get_path(viz_code, facet_data):
         return None
     except Exception as e:
         print(f"ERROR executing visualization code: {e}\n---Code---\n{viz_code}")
-        return None
-
+        return None
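Note that the return type changes from a bare plan to a (plan, search_fields) tuple, so any caller outside ui.py would also need to unpack two values. A minimal sketch of the new calling convention, assuming an already-configured llm_model as used elsewhere in the app (the wrapper name and question string below are illustrative only):

from data_processing import llm_generate_analysis_plan_with_history

def build_plan(llm_model, question):
    # The function now returns a (plan, search_fields) tuple instead of a bare plan.
    plan, search_fields = llm_generate_analysis_plan_with_history(llm_model, question, chat_history=[])
    if plan is None:
        # search_fields is still returned so callers can surface what the external API suggested.
        print("Plan generation failed; suggestions were:", search_fields)
    return plan, search_fields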
extract_results.py ADDED
@@ -0,0 +1,28 @@
+import requests
+import json
+import yaml
+
+def get_search_list_params(query, k=20):
+    """
+    Returns tuple: (search_fields, search_name)
+    """
+    url = "https://aitest.ebalina.com/stream"
+
+    response = requests.post(url,
+                             headers={'Content-Type': 'application/json'},
+                             json={"query": query, "k": k},
+                             stream=True)
+
+    for line in response.iter_lines():
+        if line and line.startswith(b'data: '):
+            try:
+                data = json.loads(line[6:])
+                if data.get('log_title') == 'Search List Result':
+                    yaml_data = yaml.safe_load(data['content'])
+                    # Return both the suggested fields and the search name
+                    return yaml_data.get('search_fields', []), yaml_data.get('search_name', '')
+            except (json.JSONDecodeError, yaml.YAMLError, KeyError):
+                continue
+
+    # Return an empty field list and name if no valid event is found
+    return [], ""
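A minimal usage sketch of the new helper, assuming the streaming endpoint is reachable and emits a 'Search List Result' event with 'field_name'/'field_value' pairs as in the code above; the query string is illustrative:

from extract_results import get_search_list_params

query = "Show me the top 5 companies by total deal value in 2023"  # illustrative query
try:
    search_fields, search_name = get_search_list_params(query, k=20)
except Exception as exc:
    # Network or parsing failures should not break the analysis flow.
    search_fields, search_name = [], ""
    print(f"Could not fetch suggestions: {exc}")

for field in search_fields:
    print(f"{field['field_name']} = {field['field_value']}")

This mirrors how data_processing.py wraps the call in a try/except so a failed suggestion lookup degrades to an empty list instead of aborting plan generation.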
llm_prompts.py CHANGED
@@ -11,9 +11,13 @@ import datetime
 import json
 from solr_metadata import format_metadata_for_prompt
 
-def get_analysis_plan_prompt(natural_language_query, chat_history):
+def get_analysis_plan_prompt(natural_language_query, chat_history, search_fields=None):
     """
     Generates the prompt for creating a Solr analysis plan from a user query.
+    Args:
+        natural_language_query (str): The user's query.
+        chat_history (list): A list of previous user and bot messages.
+        search_fields (list, optional): A list of dictionaries with 'field_name' and 'field_value'.
     """
     formatted_field_info = format_metadata_for_prompt()
     formatted_history = ""
@@ -21,6 +25,22 @@ def get_analysis_plan_prompt(natural_language_query, chat_history):
         if user_msg:
             formatted_history += f"- User: \"{user_msg}\"\n"
 
+    dynamic_fields_prompt_section = ""
+    if search_fields:
+        formatted_fields = "\n".join([f" - {field['field_name']}: {field['field_value']}" for field in search_fields])
+        dynamic_fields_prompt_section = f"""
+---
+### DYNAMIC FIELD SUGGESTIONS (Use Critically)
+
+An external API has suggested the following field-value pairs based on your query.
+**These are only HINTS.** Do NOT use them blindly.
+Critically evaluate if they make sense. For example, a `molecule_name` associated with a `company_name` might be irrelevant or illogical.
+Use only what is logical for the query. Do not construct filters from fields/values that do not make sense.
+
+**Suggested Fields:**
+{formatted_fields}
+"""
+
     return f"""
 You are an expert data analyst and Solr query engineer. Your task is to convert a natural language question into a structured JSON "Analysis Plan". This plan will be used to run two separate, efficient queries: one for aggregate data (facets) and one for finding illustrative examples (grouping).
 
@@ -43,6 +63,8 @@ You are an expert data analyst and Solr query engineer. Your task is to convert
 ### FIELD DEFINITIONS (Your Source of Truth)
 
 {formatted_field_info}
+{dynamic_fields_prompt_section}
+
 ---
 ### CHAT HISTORY
 {formatted_history}
@@ -113,7 +135,7 @@ Convert the following user query into a single, raw JSON "Analysis Plan" object,
 
 **Current User Query:** `{natural_language_query}`
 """
-
+# The other prompt functions remain unchanged.
 def get_synthesis_report_prompt(query, quantitative_data, qualitative_data, plan):
     """
     Generates the prompt for synthesizing a final report from the query results.
@@ -343,4 +365,4 @@ plt.tight_layout()
 
 **Your Task:**
 Now, generate the Python code.
-"""
+"""
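A short sketch of how the new search_fields argument feeds the prompt, assuming the repo's solr_metadata module is importable; the suggestion list below is hypothetical but follows the documented 'field_name'/'field_value' shape:

from llm_prompts import get_analysis_plan_prompt

# Hypothetical suggestions purely for illustration.
search_fields = [
    {"field_name": "company_name", "field_value": "Acme Pharma"},
    {"field_name": "molecule_name", "field_value": "examplimab"},
]

prompt = get_analysis_plan_prompt(
    "Top licensing deals for examplimab in 2023",  # illustrative query
    chat_history=[],
    search_fields=search_fields,
)
# The DYNAMIC FIELD SUGGESTIONS block is included only when search_fields is non-empty.
print("DYNAMIC FIELD SUGGESTIONS" in prompt)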
ui.py CHANGED
@@ -46,6 +46,8 @@ def create_ui(llm_model, solr_client):
             msg_textbox = gr.Textbox(placeholder="Ask a question, e.g., 'Show me the top 5 companies by total deal value in 2023'", label="Your Question", interactive=True)
 
         with gr.Column(scale=2):
+            with gr.Accordion("Dynamic Field Suggestions", open=False):
+                suggestions_display = gr.Markdown("Suggestions from the external API will appear here...", visible=True)
             with gr.Accordion("Generated Analysis Plan", open=False):
                 plan_display = gr.Markdown("Plan will appear here...", visible=True)
             with gr.Accordion("Retrieved Quantitative Data", open=False):
@@ -64,21 +66,31 @@
         if history is None:
             history = []
 
-        yield (history, state, gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value=None, visible=False))
+        # Reset all displays at the beginning of a new flow
+        yield (history, state, gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value="Suggestions from the external API will appear here...", visible=False))
 
         query_context = user_input.strip()
         if not query_context:
             history.append((user_input, "Please enter a question to analyze."))
-            yield (history, state, None, None, None, None, None)
+            yield (history, state, None, None, None, None, None, None)
             return
 
         history.append((user_input, f"Analyzing: '{query_context}'\n\n*Generating analysis plan...*"))
-        yield (history, state, None, None, None, None, None)
-
-        analysis_plan = llm_generate_analysis_plan_with_history(llm_model, query_context, history)
+        yield (history, state, None, None, None, None, None, None)
+
+        # Generate plan and get search field suggestions
+        analysis_plan, search_fields = llm_generate_analysis_plan_with_history(llm_model, query_context, history)
+
+        # Update and display search field suggestions in its own accordion
+        if search_fields:
+            suggestions_md = "**External API Suggestions:**\n" + "\n".join([f"- `{field['field_name']}`: `{field['field_value']}`" for field in search_fields])
+            suggestions_display_update = gr.update(value=suggestions_md, visible=True)
+        else:
+            suggestions_display_update = gr.update(value="No suggestions were returned from the external API.", visible=True)
 
         if not analysis_plan:
             history.append((None, "I'm sorry, I couldn't generate a valid analysis plan. Please try rephrasing."))
-            yield (history, state, None, None, None, None, None)
+            yield (history, state, None, None, None, None, None, suggestions_display_update)
             return
 
         history.append((None, "✅ Analysis plan generated!"))
@@ -89,11 +101,12 @@
         """
         history.append((None, plan_summary))
         formatted_plan = f"**Full Analysis Plan:**\n```json\n{json.dumps(analysis_plan, indent=2)}\n```"
-        yield (history, state, None, None, gr.update(value=formatted_plan, visible=True), None, None)
+        yield (history, state, None, None, gr.update(value=formatted_plan, visible=True), None, None, suggestions_display_update)
 
         history.append((None, "*Executing queries for aggregates and examples...*"))
-        yield (history, state, None, None, gr.update(value=formatted_plan, visible=True), None, None)
+        yield (history, state, None, None, gr.update(value=formatted_plan, visible=True), None, None, suggestions_display_update)
 
+        # Execute queries in parallel
         aggregate_data = None
         example_data = None
         with concurrent.futures.ThreadPoolExecutor() as executor:
@@ -104,17 +117,19 @@
 
         if not aggregate_data or aggregate_data.get('count', 0) == 0:
             history.append((None, "No data was found for your query. Please try a different question."))
-            yield (history, state, None, None, gr.update(value=formatted_plan, visible=True), None, None)
+            yield (history, state, None, None, gr.update(value=formatted_plan, visible=True), None, None, suggestions_display_update)
             return
 
+        # Display retrieved data
         formatted_agg_data = f"**Quantitative (Aggregate) Data:**\n```json\n{json.dumps(aggregate_data, indent=2)}\n```"
         formatted_qual_data = f"**Qualitative (Example) Data:**\n```json\n{json.dumps(example_data, indent=2)}\n```"
         qual_data_display_update = gr.update(value=formatted_qual_data, visible=True)
-        yield (history, state, None, None, gr.update(value=formatted_plan, visible=True), gr.update(value=formatted_agg_data, visible=True), qual_data_display_update)
+        yield (history, state, None, None, gr.update(value=formatted_plan, visible=True), gr.update(value=formatted_agg_data, visible=True), qual_data_display_update, suggestions_display_update)
 
         history.append((None, "✅ Data retrieved. Generating visualization and final report..."))
-        yield (history, state, None, None, gr.update(value=formatted_plan, visible=True), gr.update(value=formatted_agg_data, visible=True), qual_data_display_update)
+        yield (history, state, None, None, gr.update(value=formatted_plan, visible=True), gr.update(value=formatted_agg_data, visible=True), qual_data_display_update, suggestions_display_update)
 
+        # Generate viz and report
         with concurrent.futures.ThreadPoolExecutor() as executor:
             viz_future = executor.submit(llm_generate_visualization_code, llm_model, query_context, aggregate_data)
 
@@ -122,7 +137,7 @@
         stream_history = history[:]
         for chunk in llm_synthesize_enriched_report_stream(llm_model, query_context, aggregate_data, example_data, analysis_plan):
             report_text += chunk
-            yield (stream_history, state, None, gr.update(value=report_text, visible=True), gr.update(value=formatted_plan, visible=True), gr.update(value=formatted_agg_data, visible=True), qual_data_display_update)
+            yield (stream_history, state, None, gr.update(value=report_text, visible=True), gr.update(value=formatted_plan, visible=True), gr.update(value=formatted_agg_data, visible=True), qual_data_display_update, suggestions_display_update)
 
         history.append((None, report_text))
 
@@ -132,13 +147,13 @@
         if not plot_path:
             history.append((None, "*I was unable to generate a plot for this data.*\n"))
 
-        yield (history, state, output_plot, report_text, gr.update(value=formatted_plan, visible=True), gr.update(value=formatted_agg_data, visible=True), qual_data_display_update)
+        yield (history, state, output_plot, gr.update(value=report_text), gr.update(value=formatted_plan, visible=True), gr.update(value=formatted_agg_data, visible=True), qual_data_display_update, suggestions_display_update)
 
         state['query_count'] += 1
         state['last_suggestions'] = parse_suggestions_from_report(report_text)
         next_prompt = "Analysis complete. What would you like to explore next?"
         history.append((None, next_prompt))
-        yield (history, state, output_plot, report_text, gr.update(value=formatted_plan, visible=True), gr.update(value=formatted_agg_data, visible=True), qual_data_display_update)
+        yield (history, state, output_plot, gr.update(value=report_text), gr.update(value=formatted_plan, visible=True), gr.update(value=formatted_agg_data, visible=True), qual_data_display_update, suggestions_display_update)
 
     def reset_all():
         """Resets the entire UI for a new analysis session."""
@@ -150,13 +165,14 @@
             gr.update(value=None, visible=False),
             gr.update(value=None, visible=False),
             gr.update(value=None, visible=False),
+            gr.update(value=None, visible=False),
             gr.update(value=None, visible=False)
         )
 
     msg_textbox.submit(
         fn=process_analysis_flow,
         inputs=[msg_textbox, chatbot, state],
-        outputs=[chatbot, state, plot_display, report_display, plan_display, quantitative_data_display, qualitative_data_display],
+        outputs=[chatbot, state, plot_display, report_display, plan_display, quantitative_data_display, qualitative_data_display, suggestions_display],
     ).then(
         lambda: gr.update(value=""),
         None,
@@ -167,9 +183,8 @@
     clear_button.click(
         fn=reset_all,
         inputs=None,
-        outputs=[chatbot, state, msg_textbox, plot_display, report_display, plan_display, quantitative_data_display, qualitative_data_display],
+        outputs=[chatbot, state, msg_textbox, plot_display, report_display, plan_display, quantitative_data_display, qualitative_data_display, suggestions_display],
         queue=False
    )
 
-    return demo
-
+    return demo
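With this change every yield from process_analysis_flow must line up position-for-position with the eight components in the outputs list, with suggestions_display occupying the new last slot. A self-contained sketch of the suggestion-formatting logic added above (the sample field values are illustrative only):

def format_suggestions(search_fields):
    # Mirrors the accordion-content logic added to process_analysis_flow.
    if search_fields:
        return "**External API Suggestions:**\n" + "\n".join(
            f"- `{field['field_name']}`: `{field['field_value']}`" for field in search_fields
        )
    return "No suggestions were returned from the external API."

print(format_suggestions([{"field_name": "company_name", "field_value": "Acme Pharma"}]))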