Spaces:

dolphinium
/

pc-ai-data-analyst-v2

Running

App Files Community

dolphinium committed 23 days ago

Commit

bba916d

1 Parent(s): f3249d4

enhance vis generation code prompt.

Browse files

Files changed (1) hide show

llm_prompts.py +66 -127

llm_prompts.py CHANGED Viewed

@@ -214,7 +214,7 @@ This is the most critical part of your task. A bad choice leads to a useless, bo
       "deal_values_by_route": {{
         "type": "terms",
         "field": "route_branch",
-        "limit": 10,
         "sort": "total_deal_value desc",
         "facet": {{
           "total_deal_value": "sum(total_deal_value_in_million)"
@@ -239,7 +239,6 @@ Convert the following user query into a single, raw JSON "Analysis Plan" object.
 **Current User Query:** `{natural_language_query}`
 """
-# The other prompt functions remain unchanged.
 def get_synthesis_report_prompt(query, quantitative_data, qualitative_data, plan):
     """
     Generates the prompt for synthesizing a final report from the query results.
@@ -277,7 +276,7 @@ This data shows the high-level aggregates.
 {json.dumps(quantitative_data, indent=2)}
 ```
-**3. Qualitative Data (The 'Why'):
 These are the single most significant documents driving the numbers for each category.
 {qualitative_prompt_str}
@@ -307,117 +306,53 @@ Your report must be in clean, professional Markdown and follow this structure pr
 def get_visualization_code_prompt(query_context, facet_data):
     """
-    Generates the prompt for creating Python visualization code.
     """
     return f"""
-You are a Python Data Visualization expert specializing in Matplotlib and Seaborn.
-Your task is to generate robust, error-free Python code to create a single, insightful visualization based on the user's query and the provided Solr facet data.
-**User's Analytical Goal:**
-\"{query_context}\"
-**Aggregated Data (from Solr Facets):**
 ```json
 {json.dumps(facet_data, indent=2)}
 ```
 ---
 ### **CRITICAL INSTRUCTIONS: CODE GENERATION RULES**
-You MUST follow these rules to avoid errors.
-**1. Identify the Data Structure FIRST:**
-Before writing any code, analyze the `facet_data` JSON to determine its structure. There are three common patterns. Choose the correct template below.
-   *   **Pattern A: Simple `terms` Facet.** The JSON has ONE main key (besides "count") which contains a list of "buckets". Each bucket has a "val" and a "count". Use this for standard bar charts.
-   *   **Pattern B: Multiple `query` Facets.** The JSON has MULTIPLE keys (besides "count"), and each key is an object containing metrics like "count" or "sum(...)". Use this for comparing a few distinct items (e.g., "oral vs injection").
-   *   **Pattern C: Nested `terms` Facet.** The JSON has one main key with a list of "buckets", but inside EACH bucket, there are nested metric objects. This is used for grouped comparisons (e.g., "compare 2024 vs 2025 across categories"). This almost always requires `pandas`.
-**2. Use the Correct Parsing Template:**
----
-**TEMPLATE FOR PATTERN A (Simple Bar Chart from `terms` facet):**
-```python
-import matplotlib.pyplot as plt
-import seaborn as sns
-import pandas as pd
-plt.style.use('seaborn-v0_8-whitegrid')
-fig, ax = plt.subplots(figsize=(12, 8))
-# Dynamically find the main facet key (the one with 'buckets')
-facet_key = None
-for key, value in facet_data.items():
-    if isinstance(value, dict) and 'buckets' in value:
-        facet_key = key
-        break
-if facet_key:
-    buckets = facet_data[facet_key].get('buckets', [])
-    # Check if buckets contain data
-    if buckets:
-        df = pd.DataFrame(buckets)
-        # Check for a nested metric or use 'count'
-        if 'total_deal_value' in df.columns and pd.api.types.is_dict_like(df['total_deal_value'].iloc):
-             # Example for nested sum metric
-             df['value'] = df['total_deal_value'].apply(lambda x: x.get('sum', 0))
-             y_axis_label = 'Sum of Total Deal Value'
-        else:
-             df.rename(columns={{'count': 'value'}}, inplace=True)
-             y_axis_label = 'Count'
-        sns.barplot(data=df, x='val', y='value', ax=ax, palette='viridis')
-        ax.set_xlabel('Category')
-        ax.set_ylabel(y_axis_label)
-    else:
-        ax.text(0.5, 0.5, 'No data in buckets to plot.', ha='center')
-ax.set_title('Your Insightful Title Here')
-# Correct way to rotate labels to prevent errors
-plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
-plt.tight_layout()
-```
 ---
-**TEMPLATE FOR PATTERN B (Comparison Bar Chart from `query` facets):**
-```python
-import matplotlib.pyplot as plt
-import seaborn as sns
-import pandas as pd
-plt.style.use('seaborn-v0_8-whitegrid')
-fig, ax = plt.subplots(figsize=(10, 6))
-labels = []
-values = []
-# Iterate through top-level keys, skipping the 'count'
-for key, data_dict in facet_data.items():
-    if key == 'count' or not isinstance(data_dict, dict):
-        continue
-    # Extract the label (e.g., 'oral_deals' -> 'Oral')
-    label = key.replace('_deals', '').replace('_', ' ').title()
-    # Find the metric value, which is NOT 'count'
-    metric_value = 0
-    for sub_key, sub_value in data_dict.items():
-        if sub_key != 'count':
-            metric_value = sub_value
-            break # Found the metric
-    labels.append(label)
-    values.append(metric_value)
-if labels:
-    sns.barplot(x=labels, y=values, ax=ax, palette='mako')
-    ax.set_ylabel('Total Deal Value') # Or other metric name
-    ax.set_xlabel('Category')
-else:
-    ax.text(0.5, 0.5, 'No query facet data to plot.', ha='center')
-ax.set_title('Your Insightful Title Here')
-plt.tight_layout()
-```
----
-**TEMPLATE FOR PATTERN C (Grouped Bar Chart from nested `terms` facet):**
 ```python
 import matplotlib.pyplot as plt
 import seaborn as sns
 import pandas as pd
@@ -425,48 +360,52 @@ import pandas as pd
 plt.style.use('seaborn-v0_8-whitegrid')
 fig, ax = plt.subplots(figsize=(14, 8))
-# Find the key that has the buckets
 facet_key = None
 for key, value in facet_data.items():
     if isinstance(value, dict) and 'buckets' in value:
         facet_key = key
         break
 if facet_key and facet_data[facet_key].get('buckets'):
-    # This list comprehension is robust for parsing nested metrics
-    plot_data = []
     for bucket in facet_data[facet_key]['buckets']:
-        category = bucket['val']
-        # Find all nested metrics (e.g., total_deal_value_2025)
         for sub_key, sub_value in bucket.items():
             if isinstance(sub_value, dict) and 'sum' in sub_value:
-                # Extracts year from 'total_deal_value_2025' -> '2025'
-                year = sub_key.split('_')[-1]
                 value = sub_value['sum']
-                plot_data.append({{'Category': category, 'Year': year, 'Value': value}})
-    if plot_data:
-        df = pd.DataFrame(plot_data)
-        sns.barplot(data=df, x='Category', y='Value', hue='Year', ax=ax)
-        ax.set_ylabel('Total Deal Value')
-        ax.set_xlabel('Business Model')
-        # Correct way to rotate labels to prevent errors
-        plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
-    else:
-        ax.text(0.5, 0.5, 'No nested data found to plot.', ha='center')
 else:
-    ax.text(0.5, 0.5, 'No data in buckets to plot.', ha='center')
-ax.set_title('Your Insightful Title Here')
 plt.tight_layout()
 ```
 ---
-**3. Final Code Generation:**
-- **DO NOT** include `plt.show()`.
-- **DO** set a dynamic and descriptive `ax.set_title()`, `ax.set_xlabel()`, and `ax.set_ylabel()`.
-- **DO NOT** wrap the code in ```python ... ```. Output only the raw Python code.
-- Adapt the chosen template to the specific keys and metrics in the provided `facet_data`.
-**Your Task:**
-Now, generate the Python code.
 """

       "deal_values_by_route": {{
         "type": "terms",
         "field": "route_branch",
+        "limit": 2,
         "sort": "total_deal_value desc",
         "facet": {{
           "total_deal_value": "sum(total_deal_value_in_million)"
 **Current User Query:** `{natural_language_query}`
 """
 def get_synthesis_report_prompt(query, quantitative_data, qualitative_data, plan):
     """
     Generates the prompt for synthesizing a final report from the query results.
 {json.dumps(quantitative_data, indent=2)}
 ```
+**3. Qualitative Data (The 'Why'):**
 These are the single most significant documents driving the numbers for each category.
 {qualitative_prompt_str}
 def get_visualization_code_prompt(query_context, facet_data):
     """
+    Generates a flexible prompt for creating Python visualization code.
     """
     return f"""
+You are a world-class Python data visualization expert specializing in Matplotlib and Seaborn.
+Your primary task is to generate a single, insightful, and robust Python script to visualize the provided data. The visualization should directly answer the user's analytical goal.
+**1. User's Analytical Goal:**
+"{query_context}"
+**2. Aggregated Data (from Solr Facets):**
 ```json
 {json.dumps(facet_data, indent=2)}
 ```
 ---
 ### **CRITICAL INSTRUCTIONS: CODE GENERATION RULES**
+You MUST follow these rules meticulously to ensure the code runs without errors in a server environment.
+**A. Analyze the Data & Choose the Right Chart:**
+- **Inspect the Data:** Before writing any code, carefully examine the structure of the `facet_data` JSON. Is it a simple list of categories and counts? Is it a nested structure comparing metrics across categories? Is it a time-series?
+- **Select the Best Chart Type:** Based on the data and the user's goal, choose the most effective chart.
+    - **Bar Chart:** Ideal for comparing quantities across different categories (e.g., top companies by deal value).
+    - **Grouped Bar Chart:** Use when comparing a metric across categories for a few groups (e.g., deal values for 2023 vs. 2024 by company).
+    - **Line Chart:** Best for showing a trend over time (e.g., number of approvals per year).
+    - **Pie Chart:** Use ONLY for showing parts of a whole, and only with a few (2-5) categories. Generally, bar charts are better.
+- **Tell a Story:** Your visualization should be more than just a plot; it should reveal the key insight from the data.
+- **Direct Answer:** If the user asks for a specific comparison (e.g., "compare x with y"), produce a comparison visualization of x and y, and nothing more.
+**B. Non-Negotiable Code Requirements:**
+1.  **Imports:** You must import `matplotlib.pyplot as plt`, `seaborn as sns`, and `pandas as pd`.
+2.  **Use Pandas:** ALWAYS parse the `facet_data` into a pandas DataFrame. This is more robust and flexible than iterating through dictionaries directly.
+3.  **Figure and Axes:** Use `fig, ax = plt.subplots()` to create the figure and axes objects. This gives you better control.
+4.  **Styling:** Apply a clean and professional style, for example: `plt.style.use('seaborn-v0_8-whitegrid')` and use a suitable Seaborn palette (e.g., `palette='viridis'`).
+5.  **NO `plt.show()`:** Your code will be run on a server. **DO NOT** include `plt.show()`.
+6.  **Provide the `fig` Object:** The execution environment expects a Matplotlib figure object named `fig`. Your code does not need to save the figure or handle any output path, but it **MUST** produce the final `fig` object correctly. The calling function will handle saving it.
+7.  **Titles and Labels:** You MUST set a clear and descriptive title and labels for the x and y axes. The title should reflect the user's query.
+8.  **Axis Label Readability:** If x-axis labels are long, you MUST rotate them to prevent overlap. Use this robust method: `plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")`.
+9.  **Layout:** Use `plt.tight_layout()` at the end to ensure all elements fit nicely.
+10. **Error Handling:** Your code should be robust. If the `facet_data` contains no "buckets" or data to plot, the code should not crash. It should instead produce a plot with a message like "No data available to plot."
 ---
+### **High-Quality Example (Grouped Bar Chart)**
+This example shows how to parse a nested facet structure into a DataFrame and create an insightful grouped bar chart. Adapt its principles to your specific task.
 ```python
+# --- Imports and Style ---
 import matplotlib.pyplot as plt
 import seaborn as sns
 import pandas as pd
 plt.style.use('seaborn-v0_8-whitegrid')
 fig, ax = plt.subplots(figsize=(14, 8))
+# --- Data Parsing ---
+# Dynamically find the main facet key (the one with 'buckets')
 facet_key = None
 for key, value in facet_data.items():
     if isinstance(value, dict) and 'buckets' in value:
         facet_key = key
         break
+plot_data = []
+# Check if a valid key and buckets were found
 if facet_key and facet_data[facet_key].get('buckets'):
+    # This robustly parses nested metrics (e.g., a sum for each year)
     for bucket in facet_data[facet_key]['buckets']:
+        category = bucket.get('val', 'N/A')
+        # Find all nested metrics inside the bucket
         for sub_key, sub_value in bucket.items():
             if isinstance(sub_value, dict) and 'sum' in sub_value:
+                # Extracts '2025' from a key like 'total_value_2025'
+                group = sub_key.split('_')[-1]
                 value = sub_value['sum']
+                plot_data.append({{'Category': category, 'Group': group, 'Value': value}})
+# --- Plotting ---
+if plot_data:
+    df = pd.DataFrame(plot_data)
+    sns.barplot(data=df, x='Category', y='Value', hue='Group', ax=ax, palette='viridis')
+    # --- Labels and Titles ---
+    ax.set_title('Comparison of Total Value by Category and Group')
+    ax.set_xlabel('Category')
+    ax.set_ylabel('Total Value')
+    # --- Formatting ---
+    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
 else:
+    # --- Handle No Data ---
+    ax.text(0.5, 0.5, 'No data available to plot.', horizontalalignment='center', verticalalignment='center', transform=ax.transAxes)
+    ax.set_title('Data Visualization')
+# --- Final Layout ---
 plt.tight_layout()
 ```
 ---
+### **Your Task:**
+Now, generate the raw Python code to create the best possible visualization for the user's goal based on the provided data.
+Do not wrap the code in ```python ... ```.
 """