dolphinium
feat: Integrate dynamic field suggestions from external API into analysis plan generation and UI
f74c067
""" | |
Contains the prompt templates for interacting with the Gemini LLM. | |
Separating prompts from the application logic makes them easier to manage, | |
modify, and version. This module provides functions that return the formatted | |
prompt strings required by the data processing module. | |
""" | |
import datetime | |
import json | |
from solr_metadata import format_metadata_for_prompt | |
def get_analysis_plan_prompt(natural_language_query, chat_history, search_fields=None): | |
""" | |
Generates the prompt for creating a Solr analysis plan from a user query. | |
Args: | |
natural_language_query (str): The user's query. | |
chat_history (list): A list of previous user and bot messages. | |
search_fields (list, optional): A list of dictionaries with 'field_name' and 'field_value'. | |
""" | |
formatted_field_info = format_metadata_for_prompt() | |
formatted_history = "" | |
for user_msg, bot_msg in chat_history: | |
if user_msg: | |
formatted_history += f"- User: \"{user_msg}\"\n" | |
dynamic_fields_prompt_section = "" | |
if search_fields: | |
formatted_fields = "\n".join([f" - {field['field_name']}: {field['field_value']}" for field in search_fields]) | |
dynamic_fields_prompt_section = f""" | |
--- | |
### DYNAMIC FIELD SUGGESTIONS (Use Critically) | |
An external API has suggested the following field-value pairs based on your query. | |
**These are only HINTS.** Do NOT use them blindly. | |
Critically evaluate if they make sense. For example, a `molecule_name` associated with a `company_name` might be irrelevant or illogical. | |
Use only what is logical for the query. Do not construct filters from fields/values that do not make sense. | |
**Suggested Fields:** | |
{formatted_fields} | |
""" | |
return f""" | |
You are an expert data analyst and Solr query engineer. Your task is to convert a natural language question into a structured JSON "Analysis Plan". This plan will be used to run two separate, efficient queries: one for aggregate data (facets) and one for finding illustrative examples (grouping). | |
--- | |
### CONTEXT & RULES | |
1. **Today's Date for Calculations**: {datetime.datetime.now().date().strftime("%Y-%m-%d")} | |
2. **Field Usage**: You MUST use the fields described in the 'Field Definitions'. Pay close attention to the definitions to select the correct field, especially the `_s` fields for searching. Do not use fields ending with `_s` in `group.field` or facet `field` unless necessary for the analysis. | |
3. **Dimension vs. Measure**: | |
* `analysis_dimension`: The primary categorical field the user wants to group by (e.g., `company_name`, `route_branch`). This is the `group by` field. | |
* `analysis_measure`: The metric to aggregate (e.g., `sum(total_deal_value_in_million)`) or the method of counting (`count`). | |
* `sort_field_for_examples`: The raw field used to find the "best" example. If `analysis_measure` is `sum(field)`, this should be `field`. If `analysis_measure` is `count`, this should be a relevant field like `date`. | |
4. **Crucial Sorting Rules**: | |
* For `group.sort`: If `analysis_measure` involves a function on a field (e.g., `sum(total_deal_value_in_million)`), you MUST use the full function: `group.sort: 'sum(total_deal_value_in_million) desc'`. | |
* If `analysis_measure` is 'count', you MUST OMIT the `group.sort` parameter entirely. | |
* For sorting, NEVER use 'date_year' directly for `sort` in `terms` facets; use 'index asc' or 'index desc' instead. For other sorts, use 'date'. | |
5. **Output Format**: Your final output must be a single, raw JSON object. Do not add comments or markdown formatting. | |
--- | |
### FIELD DEFINITIONS (Your Source of Truth) | |
{formatted_field_info} | |
{dynamic_fields_prompt_section} | |
--- | |
### CHAT HISTORY | |
{formatted_history} | |
--- | |
### EXAMPLES | |
**User Query 1:** "What are the top 5 companies by total deal value in 2023?" | |
**Correct JSON Output 1:** | |
```json | |
{{ | |
"analysis_dimension": "company_name", | |
"analysis_measure": "sum(total_deal_value_in_million)", | |
"sort_field_for_examples": "total_deal_value_in_million", | |
"query_filter": "date_year:2023 AND total_deal_value_in_million:[0 TO *]", | |
"quantitative_request": {{ | |
"json.facet": {{ | |
"companies_by_deal_value": {{ | |
"type": "terms", | |
"field": "company_name", | |
"limit": 5, | |
"sort": "total_value desc", | |
"facet": {{ | |
"total_value": "sum(total_deal_value_in_million)" | |
}} | |
}} | |
}} | |
}}, | |
"qualitative_request": {{ | |
"group": true, | |
"group.field": "company_name", | |
"group.limit": 1, | |
"group.sort": "sum(total_deal_value_in_million) desc", | |
"sort": "total_deal_value_in_million desc" | |
}} | |
}} | |
``` | |
**User Query 2:** "What are the most common news types for infections this year?" | |
**Correct JSON Output 2:** | |
```json | |
{{ | |
"analysis_dimension": "news_type", | |
"analysis_measure": "count", | |
"sort_field_for_examples": "date", | |
"query_filter": "therapeutic_category_s:infections AND date_year:{datetime.datetime.now().year}", | |
"quantitative_request": {{ | |
"json.facet": {{ | |
"news_by_type": {{ | |
"type": "terms", | |
"field": "news_type", | |
"limit": 10, | |
"sort": "count desc" | |
}} | |
}} | |
}}, | |
"qualitative_request": {{ | |
"group": true, | |
"group.field": "news_type", | |
"group.limit": 1, | |
"sort": "date desc" | |
}} | |
}} | |
``` | |
--- | |
### YOUR TASK | |
Convert the following user query into a single, raw JSON "Analysis Plan" object, strictly following all rules and considering the chat history. | |
**Current User Query:** `{natural_language_query}` | |
""" | |
# The other prompt functions remain unchanged. | |
def get_synthesis_report_prompt(query, quantitative_data, qualitative_data, plan): | |
""" | |
Generates the prompt for synthesizing a final report from the query results. | |
""" | |
qualitative_prompt_str = "" | |
dimension = plan.get('analysis_dimension', 'N/A') | |
if qualitative_data and dimension in qualitative_data: | |
for group in qualitative_data.get(dimension, {}).get('groups', []): | |
group_value = group.get('groupValue', 'N/A') | |
if group.get('doclist', {}).get('docs'): | |
doc = group.get('doclist', {}).get('docs', [{}])[0] | |
title = doc.get('abstract', ['No Title']) | |
content_list = doc.get('content', []) | |
content_snip = (' '.join(content_list[0].split()[:40]) + '...') if content_list else 'No content available.' | |
metric_val_raw = doc.get(plan.get('sort_field_for_examples'), 'N/A') | |
metric_val = metric_val_raw[0] if isinstance(metric_val_raw, list) else metric_val_raw | |
qualitative_prompt_str += f"- **For category `{group_value}`:**\n" | |
qualitative_prompt_str += f" - **Top Example Title:** {title}\n" | |
qualitative_prompt_str += f" - **Metric Value:** {metric_val}\n" | |
qualitative_prompt_str += f" - **Content Snippet:** {content_snip}\n\n" | |
return f""" | |
You are a top-tier business intelligence analyst. Your task is to write an insightful, data-driven report for an executive. You must synthesize quantitative data (the 'what') with qualitative examples (the 'why') to tell a complete story. | |
--- | |
### AVAILABLE INFORMATION | |
**1. The User's Core Question:** | |
\"{query}\" | |
**2. Quantitative Data (The 'What'):** | |
This data shows the high-level aggregates. | |
```json | |
{json.dumps(quantitative_data, indent=2)} | |
``` | |
**3. Qualitative Data (The 'Why'): | |
These are the single most significant documents driving the numbers for each category. | |
{qualitative_prompt_str} | |
--- | |
### REPORTING INSTRUCTIONS | |
Your report must be in clean, professional Markdown and follow this structure precisely. | |
**Report Structure:** | |
`## Executive Summary` | |
- A 1-2 sentence, top-line answer to the user's question based on the quantitative data. | |
`### Key Findings` | |
- Use bullet points to highlight the main figures from the quantitative data. Interpret the numbers. | |
`### Key Drivers & Illustrative Examples` | |
- **This is the most important section.** Explain the "so what?" behind the numbers. | |
- Use the qualitative examples to explain *why* a category is high or low. Reference the top example document for each main category. | |
`### Deeper Dive: Suggested Follow-up Analyses` | |
- Propose 2-3 logical next questions based on your analysis to uncover deeper trends. | |
--- | |
**Generate the full report now, paying close attention to all formatting and spacing rules.** | |
""" | |
def get_visualization_code_prompt(query_context, facet_data): | |
""" | |
Generates the prompt for creating Python visualization code. | |
""" | |
return f""" | |
You are a Python Data Visualization expert specializing in Matplotlib and Seaborn. | |
Your task is to generate robust, error-free Python code to create a single, insightful visualization based on the user's query and the provided Solr facet data. | |
**User's Analytical Goal:** | |
\"{query_context}\" | |
**Aggregated Data (from Solr Facets):** | |
```json | |
{json.dumps(facet_data, indent=2)} | |
``` | |
--- | |
### **CRITICAL INSTRUCTIONS: CODE GENERATION RULES** | |
You MUST follow these rules to avoid errors. | |
**1. Identify the Data Structure FIRST:** | |
Before writing any code, analyze the `facet_data` JSON to determine its structure. There are three common patterns. Choose the correct template below. | |
* **Pattern A: Simple `terms` Facet.** The JSON has ONE main key (besides "count") which contains a list of "buckets". Each bucket has a "val" and a "count". Use this for standard bar charts. | |
* **Pattern B: Multiple `query` Facets.** The JSON has MULTIPLE keys (besides "count"), and each key is an object containing metrics like "count" or "sum(...)". Use this for comparing a few distinct items (e.g., "oral vs injection"). | |
* **Pattern C: Nested `terms` Facet.** The JSON has one main key with a list of "buckets", but inside EACH bucket, there are nested metric objects. This is used for grouped comparisons (e.g., "compare 2024 vs 2025 across categories"). This almost always requires `pandas`. | |
**2. Use the Correct Parsing Template:** | |
--- | |
**TEMPLATE FOR PATTERN A (Simple Bar Chart from `terms` facet):** | |
```python | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
import pandas as pd | |
plt.style.use('seaborn-v0_8-whitegrid') | |
fig, ax = plt.subplots(figsize=(12, 8)) | |
# Dynamically find the main facet key (the one with 'buckets') | |
facet_key = None | |
for key, value in facet_data.items(): | |
if isinstance(value, dict) and 'buckets' in value: | |
facet_key = key | |
break | |
if facet_key: | |
buckets = facet_data[facet_key].get('buckets', []) | |
# Check if buckets contain data | |
if buckets: | |
df = pd.DataFrame(buckets) | |
# Check for a nested metric or use 'count' | |
if 'total_deal_value' in df.columns and pd.api.types.is_dict_like(df['total_deal_value'].iloc): | |
# Example for nested sum metric | |
df['value'] = df['total_deal_value'].apply(lambda x: x.get('sum', 0)) | |
y_axis_label = 'Sum of Total Deal Value' | |
else: | |
df.rename(columns={{'count': 'value'}}, inplace=True) | |
y_axis_label = 'Count' | |
sns.barplot(data=df, x='val', y='value', ax=ax, palette='viridis') | |
ax.set_xlabel('Category') | |
ax.set_ylabel(y_axis_label) | |
else: | |
ax.text(0.5, 0.5, 'No data in buckets to plot.', ha='center') | |
ax.set_title('Your Insightful Title Here') | |
# Correct way to rotate labels to prevent errors | |
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor") | |
plt.tight_layout() | |
``` | |
--- | |
**TEMPLATE FOR PATTERN B (Comparison Bar Chart from `query` facets):** | |
```python | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
import pandas as pd | |
plt.style.use('seaborn-v0_8-whitegrid') | |
fig, ax = plt.subplots(figsize=(10, 6)) | |
labels = [] | |
values = [] | |
# Iterate through top-level keys, skipping the 'count' | |
for key, data_dict in facet_data.items(): | |
if key == 'count' or not isinstance(data_dict, dict): | |
continue | |
# Extract the label (e.g., 'oral_deals' -> 'Oral') | |
label = key.replace('_deals', '').replace('_', ' ').title() | |
# Find the metric value, which is NOT 'count' | |
metric_value = 0 | |
for sub_key, sub_value in data_dict.items(): | |
if sub_key != 'count': | |
metric_value = sub_value | |
break # Found the metric | |
labels.append(label) | |
values.append(metric_value) | |
if labels: | |
sns.barplot(x=labels, y=values, ax=ax, palette='mako') | |
ax.set_ylabel('Total Deal Value') # Or other metric name | |
ax.set_xlabel('Category') | |
else: | |
ax.text(0.5, 0.5, 'No query facet data to plot.', ha='center') | |
ax.set_title('Your Insightful Title Here') | |
plt.tight_layout() | |
``` | |
--- | |
**TEMPLATE FOR PATTERN C (Grouped Bar Chart from nested `terms` facet):** | |
```python | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
import pandas as pd | |
plt.style.use('seaborn-v0_8-whitegrid') | |
fig, ax = plt.subplots(figsize=(14, 8)) | |
# Find the key that has the buckets | |
facet_key = None | |
for key, value in facet_data.items(): | |
if isinstance(value, dict) and 'buckets' in value: | |
facet_key = key | |
break | |
if facet_key and facet_data[facet_key].get('buckets'): | |
# This list comprehension is robust for parsing nested metrics | |
plot_data = [] | |
for bucket in facet_data[facet_key]['buckets']: | |
category = bucket['val'] | |
# Find all nested metrics (e.g., total_deal_value_2025) | |
for sub_key, sub_value in bucket.items(): | |
if isinstance(sub_value, dict) and 'sum' in sub_value: | |
# Extracts year from 'total_deal_value_2025' -> '2025' | |
year = sub_key.split('_')[-1] | |
value = sub_value['sum'] | |
plot_data.append({{'Category': category, 'Year': year, 'Value': value}}) | |
if plot_data: | |
df = pd.DataFrame(plot_data) | |
sns.barplot(data=df, x='Category', y='Value', hue='Year', ax=ax) | |
ax.set_ylabel('Total Deal Value') | |
ax.set_xlabel('Business Model') | |
# Correct way to rotate labels to prevent errors | |
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor") | |
else: | |
ax.text(0.5, 0.5, 'No nested data found to plot.', ha='center') | |
else: | |
ax.text(0.5, 0.5, 'No data in buckets to plot.', ha='center') | |
ax.set_title('Your Insightful Title Here') | |
plt.tight_layout() | |
``` | |
--- | |
**3. Final Code Generation:** | |
- **DO NOT** include `plt.show()`. | |
- **DO** set a dynamic and descriptive `ax.set_title()`, `ax.set_xlabel()`, and `ax.set_ylabel()`. | |
- **DO NOT** wrap the code in ```python ... ```. Output only the raw Python code. | |
- Adapt the chosen template to the specific keys and metrics in the provided `facet_data`. | |
**Your Task:** | |
Now, generate the Python code. | |
""" |