|
import gradio as gr |
|
import json |
|
import re |
|
import datetime |
|
import pandas as pd |
|
import pysolr |
|
import google.generativeai as genai |
|
from sshtunnel import SSHTunnelForwarder |
|
import matplotlib.pyplot as plt |
|
import seaborn as sns |
|
import io |
|
import os |
|
import logging |
|
import concurrent.futures |
|
from IPython.display import display, Markdown |
|
import copy |
|
|
|
|
|
|
|
logging.getLogger('matplotlib').setLevel(logging.WARNING) |
|
|
|
|
|
|
|
SSH_HOST = os.environ.get('SSH_HOST') |
|
SSH_PORT = 5322 |
|
SSH_USER = os.environ.get('SSH_USER') |
|
SSH_PASS = os.environ.get('SSH_PASS') |
|
|
|
|
|
REMOTE_SOLR_HOST = '69.167.186.48' |
|
REMOTE_SOLR_PORT = 8983 |
|
LOCAL_BIND_PORT = 8983 |
|
SOLR_CORE_NAME = 'news' |
|
SOLR_USER = os.environ.get('SOLR_USER') |
|
SOLR_PASS = os.environ.get('SOLR_PASS') |
|
|
|
|
|
try: |
|
genai.configure(api_key=os.environ.get('GEMINI_API_KEY')) |
|
except Exception as e: |
|
print(f"β Gemini API Key Error: {e}. Please ensure 'GEMINI_API_KEY' is set in your environment.") |
|
|
|
|
|
ssh_tunnel_server = None |
|
solr_client = None |
|
llm_model = None |
|
is_initialized = False |
|
|
|
try: |
|
|
|
ssh_tunnel_server = SSHTunnelForwarder( |
|
(SSH_HOST, SSH_PORT), |
|
ssh_username=SSH_USER, |
|
ssh_password=SSH_PASS, |
|
remote_bind_address=(REMOTE_SOLR_HOST, REMOTE_SOLR_PORT), |
|
local_bind_address=('127.0.0.1', LOCAL_BIND_PORT) |
|
) |
|
ssh_tunnel_server.start() |
|
print(f"π SSH tunnel established: Local Port {ssh_tunnel_server.local_bind_port} -> Remote Solr.") |
|
|
|
|
|
solr_url = f'http://127.0.0.1:{ssh_tunnel_server.local_bind_port}/solr/{SOLR_CORE_NAME}' |
|
solr_client = pysolr.Solr(solr_url, auth=(SOLR_USER, SOLR_PASS), always_commit=True) |
|
solr_client.ping() |
|
print(f"β
Solr connection successful on core '{SOLR_CORE_NAME}'.") |
|
|
|
|
|
llm_model = genai.GenerativeModel('gemini-2.5-flash', generation_config=genai.types.GenerationConfig(temperature=0)) |
|
print(f"β
LLM Model '{llm_model.model_name}' initialized.") |
|
|
|
print("β
System Initialized Successfully.") |
|
is_initialized = True |
|
|
|
except Exception as e: |
|
print(f"\nβ An error occurred during setup: {e}") |
|
if ssh_tunnel_server and ssh_tunnel_server.is_active: |
|
ssh_tunnel_server.stop() |
|
|
|
|
|
field_metadata = [ |
|
{ |
|
"field_name": "business_model", |
|
"type": "string (categorical)", |
|
"example_values": ["pharma/bio", "drug delivery", "pharma services"], |
|
"definition": "The primary business category of the company involved in the news. Use for filtering by high-level industry segments." |
|
}, |
|
{ |
|
"field_name": "news_type", |
|
"type": "string (categorical)", |
|
"example_values": ["product news", "financial news", "regulatory news"], |
|
"definition": "The category of the news article itself (e.g., financial, regulatory, acquisition). Use for filtering by the type of event being reported." |
|
}, |
|
{ |
|
"field_name": "event_type", |
|
"type": "string (categorical)", |
|
"example_values": ["phase 2", "phase 1", "pre clinical", "marketed"], |
|
"definition": "The clinical or developmental stage of a product or event discussed in the article. Essential for queries about clinical trial phases." |
|
}, |
|
{ |
|
"field_name": "source", |
|
"type": "string (categorical)", |
|
"example_values": ["Press Release", "PR Newswire", "Business Wire"], |
|
"definition": "The original source of the news article, such as a newswire or official report." |
|
}, |
|
{ |
|
"field_name": "company_name", |
|
"type": "string (exact match, for faceting)", |
|
"example_values": ["pfizer inc.", "astrazeneca plc", "roche"], |
|
"definition": "The canonical, standardized name of a company. **Crucially, you MUST use this field for `terms` faceting** to group results by a unique company. Do NOT use this for searching." |
|
}, |
|
{ |
|
"field_name": "company_name_s", |
|
"type": "string (multi-valued, for searching)", |
|
"example_values": ["pfizer inc.", "roche", "f. hoffmann-la roche ag", "nih"], |
|
"definition": "A field containing all known names and synonyms for a company. **You MUST use this field for all `query` parameter searches involving a company name** to ensure comprehensive results. Do NOT use for `terms` faceting." |
|
}, |
|
{ |
|
"field_name": "territory_hq_s", |
|
"type": "string (multi-valued, hierarchical)", |
|
"example_values": ["united states of america", "europe", "europe western"], |
|
"definition": "The geographic location (country and continent) of a company's headquarters. It is hierarchical. Use for filtering by location." |
|
}, |
|
{ |
|
"field_name": "therapeutic_category", |
|
"type": "string (specific)", |
|
"example_values": ["cancer, other", "cancer, nsclc metastatic", "alzheimer's"], |
|
"definition": "The specific disease or therapeutic area being targeted. Use for very specific disease queries." |
|
}, |
|
{ |
|
"field_name": "therapeutic_category_s", |
|
"type": "string (multi-valued, for searching)", |
|
"example_values": ["cancer", "oncology", "infections", "cns"], |
|
"definition": "Broader, multi-valued therapeutic categories and their synonyms. **Use this field for broad category searches** in the `query` parameter." |
|
}, |
|
{ |
|
"field_name": "compound_name", |
|
"type": "string (exact match, for faceting)", |
|
"example_values": ["opdivo injection solution", "keytruda injection solution"], |
|
"definition": "The specific, full trade name of a drug. **Use this field for `terms` faceting** on compounds." |
|
}, |
|
{ |
|
"field_name": "compound_name_s", |
|
"type": "string (multi-valued, for searching)", |
|
"example_values": ["nivolumab injection solution", "opdivo injection solution", "ono-4538 injection solution"], |
|
"definition": "A field with all known trade names and synonyms for a drug. **Use this field for all `query` parameter searches** involving a compound name." |
|
}, |
|
{ |
|
"field_name": "molecule_name", |
|
"type": "string (exact match, for faceting)", |
|
"example_values": ["cannabidiol", "paclitaxel", "pembrolizumab"], |
|
"definition": "The generic, non-proprietary name of the active molecule. **Use this field for `terms` faceting** on molecules." |
|
}, |
|
{ |
|
"field_name": "molecule_name_s", |
|
"type": "string (multi-valued, for searching)", |
|
"example_values": ["cbd", "s1-220", "a1002n5s"], |
|
"definition": "A field with all known generic names and synonyms for a molecule. **Use this field for all `query` parameter searches** involving a molecule name." |
|
}, |
|
{ |
|
"field_name": "highest_phase", |
|
"type": "string (categorical)", |
|
"example_values": ["marketed", "phase 2", "phase 1"], |
|
"definition": "The highest stage of development a drug has ever reached." |
|
}, |
|
{ |
|
"field_name": "drug_delivery_branch_s", |
|
"type": "string (multi-valued, for searching)", |
|
"example_values": ["injection", "parenteral", "oral", "injection, other", "oral, other"], |
|
"definition": "The method of drug administration. **Use this for `query` parameter searches about route of administration** as it contains broader, search-friendly terms." |
|
}, |
|
{ |
|
"field_name": "drug_delivery_branch", |
|
"type": "string (categorical, specific, for faceting)", |
|
"example_values": ["injection, other", "prefilled syringes", "np liposome", "oral enteric/delayed release"], |
|
"definition": "The most specific category of drug delivery technology. **Use this field for `terms` faceting** on specific delivery technologies." |
|
}, |
|
{ |
|
"field_name": "route_branch", |
|
"type": "string (categorical)", |
|
"example_values": ["injection", "oral", "topical", "inhalation"], |
|
"definition": "The primary route of drug administration. Good for faceting on exact routes." |
|
}, |
|
{ |
|
"field_name": "molecule_api_group", |
|
"type": "string (categorical)", |
|
"example_values": ["small molecules", "biologics", "nucleic acids"], |
|
"definition": "High-level classification of the drug's molecular type." |
|
}, |
|
{ |
|
"field_name": "content", |
|
"type": "text (full-text search)", |
|
"example_values": ["The largest study to date...", "balstilimab..."], |
|
"definition": "The full text content of the news article. Use for keyword searches on topics not covered by other specific fields." |
|
}, |
|
{ |
|
"field_name": "date", |
|
"type": "date", |
|
"example_values": ["2020-10-22T00:00:00Z"], |
|
"definition": "The full publication date and time in ISO 8601 format. Use for precise date range queries." |
|
}, |
|
{ |
|
"field_name": "date_year", |
|
"type": "number (year)", |
|
"example_values": [2020, 2021, 2022], |
|
"definition": "The 4-digit year of publication. **Use this for queries involving whole years** (e.g., 'in 2023', 'last year', 'since 2020')." |
|
}, |
|
{ |
|
"field_name": "total_deal_value_in_million", |
|
"type": "number (metric)", |
|
"example_values": [50, 120.5, 176.157, 1000], |
|
"definition": "The total value of a financial deal, in millions of USD. This is the primary numeric field for financial aggregations (sum, avg, etc.). To use this, you must also filter for news that has a deal value, e.g., 'total_deal_value_in_million:[0 TO *]'." |
|
} |
|
] |
|
|
|
|
|
def format_metadata_for_prompt(metadata): |
|
formatted_string = "" |
|
for field in metadata: |
|
formatted_string += f"- **{field['field_name']}**\n" |
|
formatted_string += f" - **Type**: {field['type']}\n" |
|
formatted_string += f" - **Definition**: {field['definition']}\n" |
|
formatted_string += f" - **Examples**: {', '.join(map(str, field['example_values']))}\n\n" |
|
return formatted_string |
|
formatted_field_info = format_metadata_for_prompt(field_metadata) |
|
|
|
|
|
def parse_suggestions_from_report(report_text): |
|
"""Extracts numbered suggestions from the report's markdown text.""" |
|
suggestions_match = re.search(r"### (?:Deeper Dive: Suggested Follow-up Analyses|Suggestions for Further Exploration)\s*\n(.*?)$", report_text, re.DOTALL | re.IGNORECASE) |
|
if not suggestions_match: return [] |
|
suggestions_text = suggestions_match.group(1) |
|
suggestions = re.findall(r"^\s*\d+\.\s*(.*)", suggestions_text, re.MULTILINE) |
|
return [s.strip() for s in suggestions] |
|
|
|
|
|
def llm_generate_analysis_plan_with_history(natural_language_query, field_metadata, chat_history): |
|
""" |
|
Generates a complete analysis plan from a user query, considering chat history. |
|
This plan includes dimensions, measures, and requests for both quantitative ( |
|
facet) |
|
and qualitative (grouping) data. |
|
""" |
|
formatted_history = "" |
|
for user_msg, bot_msg in chat_history: |
|
if user_msg: |
|
formatted_history += f"- User: \"{user_msg}\"\n" |
|
|
|
prompt = f""" |
|
You are an expert data analyst and Solr query engineer. Your task is to convert a natural language question into a structured JSON "Analysis Plan". This plan will be used to run two separate, efficient queries: one for aggregate data (facets) and one for finding illustrative examples (grouping). |
|
|
|
--- |
|
### CONTEXT & RULES |
|
|
|
1. **Today's Date for Calculations**: {datetime.datetime.now().date().strftime("%Y-%m-%d")} |
|
2. **Field Usage**: You MUST use the fields described in the 'Field Definitions'. Pay close attention to the definitions to select the correct field, especially the `_s` fields for searching. Do not use fields ending with `_s` in `group.field` or facet `field` unless necessary for the analysis. |
|
3. **Dimension vs. Measure**: |
|
* `analysis_dimension`: The primary categorical field the user wants to group by (e.g., `company_name`, `route_branch`). This is the `group by` field. |
|
* `analysis_measure`: The metric to aggregate (e.g., `sum(total_deal_value_in_million)`) or the method of counting (`count`). |
|
* `sort_field_for_examples`: The raw field used to find the "best" example. If `analysis_measure` is `sum(field)`, this should be `field`. If `analysis_measure` is `count`, this should be a relevant field like `date`. |
|
4. **Crucial Sorting Rules**: |
|
* For `group.sort`: If `analysis_measure` involves a function on a field (e.g., `sum(total_deal_value_in_million)`), you MUST use the full function: `group.sort: 'sum(total_deal_value_in_million) desc'`. |
|
* If `analysis_measure` is 'count', you MUST OMIT the `group.sort` parameter entirely. |
|
* For sorting, NEVER use 'date_year'; use 'date' instead. |
|
5. **Output Format**: Your final output must be a single, raw JSON object. Do not add comments or markdown formatting. |
|
|
|
--- |
|
### FIELD DEFINITIONS (Your Source of Truth) |
|
|
|
{formatted_field_info} |
|
--- |
|
### CHAT HISTORY |
|
{formatted_history} |
|
--- |
|
### EXAMPLES |
|
|
|
**User Query 1:** "What are the top 5 companies by total deal value in 2023?" |
|
**Correct JSON Output 1:** |
|
```json |
|
{{ |
|
"analysis_dimension": "company_name", |
|
"analysis_measure": "sum(total_deal_value_in_million)", |
|
"sort_field_for_examples": "total_deal_value_in_million", |
|
"query_filter": "date_year:2023 AND total_deal_value_in_million:[0 TO *]", |
|
"quantitative_request": {{ |
|
"json.facet": {{ |
|
"companies_by_deal_value": {{ |
|
"type": "terms", |
|
"field": "company_name", |
|
"limit": 5, |
|
"sort": "total_value desc", |
|
"facet": {{ |
|
"total_value": "sum(total_deal_value_in_million)" |
|
}} |
|
}} |
|
}} |
|
}}, |
|
"qualitative_request": {{ |
|
"group": true, |
|
"group.field": "company_name", |
|
"group.limit": 1, |
|
"group.sort": "sum(total_deal_value_in_million) desc", |
|
"sort": "total_deal_value_in_million desc" |
|
}} |
|
}} |
|
``` |
|
|
|
**User Query 2:** "What are the most common news types for infections this year?" |
|
**Correct JSON Output 2:** |
|
```json |
|
{{ |
|
"analysis_dimension": "news_type", |
|
"analysis_measure": "count", |
|
"sort_field_for_examples": "date", |
|
"query_filter": "therapeutic_category_s:infections AND date_year:{datetime.datetime.now().year}", |
|
"quantitative_request": {{ |
|
"json.facet": {{ |
|
"news_by_type": {{ |
|
"type": "terms", |
|
"field": "news_type", |
|
"limit": 10, |
|
"sort": "count desc" |
|
}} |
|
}} |
|
}}, |
|
"qualitative_request": {{ |
|
"group": true, |
|
"group.field": "news_type", |
|
"group.limit": 1, |
|
"sort": "date desc" |
|
}} |
|
}} |
|
``` |
|
--- |
|
### YOUR TASK |
|
|
|
Convert the following user query into a single, raw JSON "Analysis Plan" object, strictly following all rules and considering the chat history. |
|
|
|
**Current User Query:** `{natural_language_query}` |
|
""" |
|
try: |
|
response = llm_model.generate_content(prompt) |
|
cleaned_text = re.sub(r'```json\s*|\s*```', '', response.text, flags=re.MULTILINE | re.DOTALL).strip() |
|
plan = json.loads(cleaned_text) |
|
return plan |
|
except Exception as e: |
|
raw_response_text = response.text if 'response' in locals() else 'N/A' |
|
print(f"Error in llm_generate_analysis_plan_with_history: {e}\nRaw Response:\n{raw_response_text}") |
|
return None |
|
|
|
def execute_quantitative_query(plan, solr): |
|
"""Executes the facet query to get aggregate data.""" |
|
if not plan or 'quantitative_request' not in plan or 'json.facet' not in plan.get('quantitative_request', {}): |
|
return None |
|
try: |
|
params = { |
|
"q": plan.get('query_filter', '*:*'), |
|
"rows": 0, |
|
"json.facet": json.dumps(plan['quantitative_request']['json.facet']) |
|
} |
|
results = solr.search(**params) |
|
return results.raw_response.get("facets", {}) |
|
except Exception as e: |
|
print(f"Error in quantitative query: {e}") |
|
return None |
|
|
|
def execute_qualitative_query(plan, solr): |
|
"""Executes the grouping query to get the best example docs.""" |
|
if not plan or 'qualitative_request' not in plan: |
|
return None |
|
try: |
|
qual_request = copy.deepcopy(plan['qualitative_request']) |
|
params = { |
|
"q": plan.get('query_filter', '*:*'), |
|
"rows": 3, |
|
"fl": "*,score", |
|
**qual_request |
|
} |
|
results = solr.search(**params) |
|
return results.grouped |
|
except Exception as e: |
|
print(f"Error in qualitative query: {e}") |
|
return None |
|
|
|
def llm_synthesize_enriched_report_stream(query, quantitative_data, qualitative_data, plan): |
|
""" |
|
Generates an enriched report by synthesizing quantitative aggregates |
|
and qualitative examples, and streams the result. |
|
""" |
|
qualitative_prompt_str = "" |
|
dimension = plan.get('analysis_dimension', 'N/A') |
|
if qualitative_data and dimension in qualitative_data: |
|
for group in qualitative_data.get(dimension, {}).get('groups', []): |
|
group_value = group.get('groupValue', 'N/A') |
|
if group.get('doclist', {}).get('docs'): |
|
doc = group.get('doclist', {}).get('docs', [{}])[0] |
|
title = doc.get('abstract', ['No Title']) |
|
content_list = doc.get('content', []) |
|
content_snip = (' '.join(content_list[0].split()[:40]) + '...') if content_list else 'No content available.' |
|
metric_val_raw = doc.get(plan.get('sort_field_for_examples'), 'N/A') |
|
metric_val = metric_val_raw[0] if isinstance(metric_val_raw, list) else metric_val_raw |
|
|
|
qualitative_prompt_str += f"- **For category `{group_value}`:**\n" |
|
qualitative_prompt_str += f" - **Top Example Title:** {title}\n" |
|
qualitative_prompt_str += f" - **Metric Value:** {metric_val}\n" |
|
qualitative_prompt_str += f" - **Content Snippet:** {content_snip}\n\n" |
|
|
|
prompt = f""" |
|
You are a top-tier business intelligence analyst. Your task is to write an insightful, data-driven report for an executive. You must synthesize quantitative data (the 'what') with qualitative examples (the 'why') to tell a complete story. |
|
|
|
--- |
|
### AVAILABLE INFORMATION |
|
|
|
**1. The User's Core Question:** |
|
\"{query}\" |
|
|
|
**2. Quantitative Data (The 'What'):** |
|
This data shows the high-level aggregates. |
|
```json |
|
{json.dumps(quantitative_data, indent=2)} |
|
``` |
|
|
|
**3. Qualitative Data (The 'Why'):** |
|
These are the single most significant documents driving the numbers for each category. |
|
{qualitative_prompt_str} |
|
|
|
--- |
|
### REPORTING INSTRUCTIONS |
|
|
|
Your report must be in clean, professional Markdown and follow this structure precisely. |
|
|
|
**Report Structure:** |
|
|
|
`## Executive Summary` |
|
- A 1-2 sentence, top-line answer to the user's question based on the quantitative data. |
|
|
|
`### Key Findings` |
|
- Use bullet points to highlight the main figures from the quantitative data. Interpret the numbers. |
|
|
|
`### Key Drivers & Illustrative Examples` |
|
- **This is the most important section.** Explain the "so what?" behind the numbers. |
|
- Use the qualitative examples to explain *why* a category is high or low. Reference the top example document for each main category. |
|
|
|
`### Deeper Dive: Suggested Follow-up Analyses` |
|
- Propose 2-3 logical next questions based on your analysis to uncover deeper trends. |
|
|
|
--- |
|
**Generate the full report now, paying close attention to all formatting and spacing rules.** |
|
""" |
|
try: |
|
response_stream = llm_model.generate_content(prompt, stream=True) |
|
for chunk in response_stream: |
|
yield chunk.text |
|
except Exception as e: |
|
print(f"Error in llm_synthesize_enriched_report_stream: {e}") |
|
yield "Sorry, I was unable to generate a report for this data." |
|
|
|
|
|
def llm_generate_visualization_code(query_context, facet_data): |
|
"""Generates Python code for visualization based on query and data.""" |
|
prompt = f""" |
|
You are a Python Data Visualization expert specializing in Matplotlib and Seaborn. |
|
Your task is to generate robust, error-free Python code to create a single, insightful visualization based on the user's query and the provided Solr facet data. |
|
|
|
**User's Analytical Goal:** |
|
\"{query_context}\" |
|
|
|
**Aggregated Data (from Solr Facets):** |
|
```json |
|
{json.dumps(facet_data, indent=2)} |
|
``` |
|
|
|
--- |
|
### **CRITICAL INSTRUCTIONS: CODE GENERATION RULES** |
|
You MUST follow these rules to avoid errors. |
|
|
|
**1. Identify the Data Structure FIRST:** |
|
Before writing any code, analyze the `facet_data` JSON to determine its structure. There are three common patterns. Choose the correct template below. |
|
|
|
* **Pattern A: Simple `terms` Facet.** The JSON has ONE main key (besides "count") which contains a list of "buckets". Each bucket has a "val" and a "count". Use this for standard bar charts. |
|
* **Pattern B: Multiple `query` Facets.** The JSON has MULTIPLE keys (besides "count"), and each key is an object containing metrics like "count" or "sum(...)". Use this for comparing a few distinct items (e.g., "oral vs injection"). |
|
* **Pattern C: Nested `terms` Facet.** The JSON has one main key with a list of "buckets", but inside EACH bucket, there are nested metric objects. This is used for grouped comparisons (e.g., "compare 2024 vs 2025 across categories"). This almost always requires `pandas`. |
|
|
|
**2. Use the Correct Parsing Template:** |
|
|
|
--- |
|
**TEMPLATE FOR PATTERN A (Simple Bar Chart from `terms` facet):** |
|
```python |
|
import matplotlib.pyplot as plt |
|
import seaborn as sns |
|
import pandas as pd |
|
|
|
plt.style.use('seaborn-v0_8-whitegrid') |
|
fig, ax = plt.subplots(figsize=(12, 8)) |
|
|
|
# Dynamically find the main facet key (the one with 'buckets') |
|
facet_key = None |
|
for key, value in facet_data.items(): |
|
if isinstance(value, dict) and 'buckets' in value: |
|
facet_key = key |
|
break |
|
|
|
if facet_key: |
|
buckets = facet_data[facet_key].get('buckets', []) |
|
# Check if buckets contain data |
|
if buckets: |
|
df = pd.DataFrame(buckets) |
|
# Check for a nested metric or use 'count' |
|
if 'total_deal_value' in df.columns and pd.api.types.is_dict_like(df['total_deal_value'].iloc): |
|
# Example for nested sum metric |
|
df['value'] = df['total_deal_value'].apply(lambda x: x.get('sum', 0)) |
|
y_axis_label = 'Sum of Total Deal Value' |
|
else: |
|
df.rename(columns={{'count': 'value'}}, inplace=True) |
|
y_axis_label = 'Count' |
|
|
|
sns.barplot(data=df, x='val', y='value', ax=ax, palette='viridis') |
|
ax.set_xlabel('Category') |
|
ax.set_ylabel(y_axis_label) |
|
else: |
|
ax.text(0.5, 0.5, 'No data in buckets to plot.', ha='center') |
|
|
|
|
|
ax.set_title('Your Insightful Title Here') |
|
# Correct way to rotate labels to prevent errors |
|
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor") |
|
plt.tight_layout() |
|
``` |
|
--- |
|
**TEMPLATE FOR PATTERN B (Comparison Bar Chart from `query` facets):** |
|
```python |
|
import matplotlib.pyplot as plt |
|
import seaborn as sns |
|
import pandas as pd |
|
|
|
plt.style.use('seaborn-v0_8-whitegrid') |
|
fig, ax = plt.subplots(figsize=(10, 6)) |
|
|
|
labels = [] |
|
values = [] |
|
# Iterate through top-level keys, skipping the 'count' |
|
for key, data_dict in facet_data.items(): |
|
if key == 'count' or not isinstance(data_dict, dict): |
|
continue |
|
# Extract the label (e.g., 'oral_deals' -> 'Oral') |
|
label = key.replace('_deals', '').replace('_', ' ').title() |
|
# Find the metric value, which is NOT 'count' |
|
metric_value = 0 |
|
for sub_key, sub_value in data_dict.items(): |
|
if sub_key != 'count': |
|
metric_value = sub_value |
|
break # Found the metric |
|
labels.append(label) |
|
values.append(metric_value) |
|
|
|
if labels: |
|
sns.barplot(x=labels, y=values, ax=ax, palette='mako') |
|
ax.set_ylabel('Total Deal Value') # Or other metric name |
|
ax.set_xlabel('Category') |
|
else: |
|
ax.text(0.5, 0.5, 'No query facet data to plot.', ha='center') |
|
|
|
|
|
ax.set_title('Your Insightful Title Here') |
|
plt.tight_layout() |
|
``` |
|
--- |
|
**TEMPLATE FOR PATTERN C (Grouped Bar Chart from nested `terms` facet):** |
|
```python |
|
import matplotlib.pyplot as plt |
|
import seaborn as sns |
|
import pandas as pd |
|
|
|
plt.style.use('seaborn-v0_8-whitegrid') |
|
fig, ax = plt.subplots(figsize=(14, 8)) |
|
|
|
# Find the key that has the buckets |
|
facet_key = None |
|
for key, value in facet_data.items(): |
|
if isinstance(value, dict) and 'buckets' in value: |
|
facet_key = key |
|
break |
|
|
|
if facet_key and facet_data[facet_key].get('buckets'): |
|
# This list comprehension is robust for parsing nested metrics |
|
plot_data = [] |
|
for bucket in facet_data[facet_key]['buckets']: |
|
category = bucket['val'] |
|
# Find all nested metrics (e.g., total_deal_value_2025) |
|
for sub_key, sub_value in bucket.items(): |
|
if isinstance(sub_value, dict) and 'sum' in sub_value: |
|
# Extracts year from 'total_deal_value_2025' -> '2025' |
|
year = sub_key.split('_')[-1] |
|
value = sub_value['sum'] |
|
plot_data.append({{'Category': category, 'Year': year, 'Value': value}}) |
|
|
|
if plot_data: |
|
df = pd.DataFrame(plot_data) |
|
sns.barplot(data=df, x='Category', y='Value', hue='Year', ax=ax) |
|
ax.set_ylabel('Total Deal Value') |
|
ax.set_xlabel('Business Model') |
|
# Correct way to rotate labels to prevent errors |
|
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor") |
|
else: |
|
ax.text(0.5, 0.5, 'No nested data found to plot.', ha='center') |
|
else: |
|
ax.text(0.5, 0.5, 'No data in buckets to plot.', ha='center') |
|
|
|
ax.set_title('Your Insightful Title Here') |
|
plt.tight_layout() |
|
``` |
|
--- |
|
**3. Final Code Generation:** |
|
- **DO NOT** include `plt.show()`. |
|
- **DO** set a dynamic and descriptive `ax.set_title()`, `ax.set_xlabel()`, and `ax.set_ylabel()`. |
|
- **DO NOT** wrap the code in ```python ... ```. Output only the raw Python code. |
|
- Adapt the chosen template to the specific keys and metrics in the provided `facet_data`. |
|
|
|
**Your Task:** |
|
Now, generate the Python code. |
|
""" |
|
try: |
|
|
|
generation_config = genai.types.GenerationConfig(temperature=0, max_output_tokens=2048) |
|
response = llm_model.generate_content(prompt, generation_config=generation_config) |
|
|
|
code = re.sub(r'^```python\s*|```$', '', response.text, flags=re.MULTILINE) |
|
return code |
|
except Exception as e: |
|
print(f"Error in llm_generate_visualization_code: {e}\nRaw response: {response.text}") |
|
return None |
|
|
|
def execute_viz_code_and_get_path(viz_code, facet_data): |
|
"""Executes visualization code and returns the path to the saved plot image.""" |
|
if not viz_code: return None |
|
try: |
|
if not os.path.exists('/tmp/plots'): os.makedirs('/tmp/plots') |
|
plot_path = f"/tmp/plots/plot_{datetime.datetime.now().timestamp()}.png" |
|
|
|
exec_globals = {'facet_data': facet_data, 'plt': plt, 'sns': sns, 'pd': pd} |
|
exec(viz_code, exec_globals) |
|
fig = exec_globals.get('fig') |
|
if fig: |
|
fig.savefig(plot_path, bbox_inches='tight') |
|
plt.close(fig) |
|
return plot_path |
|
return None |
|
except Exception as e: |
|
print(f"ERROR executing visualization code: {e}\n---Code---\n{viz_code}") |
|
return None |
|
|
|
|
|
def process_analysis_flow(user_input, history, state): |
|
""" |
|
A generator that manages the conversation and yields tuples of UI updates for Gradio. |
|
This version uses the dual-query (quantitative/qualitative) approach. |
|
""" |
|
if state is None: |
|
state = {'query_count': 0, 'last_suggestions': []} |
|
if history is None: |
|
history = [] |
|
|
|
|
|
yield (history, state, gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value=None, visible=False)) |
|
|
|
query_context = user_input.strip() |
|
if not query_context: |
|
history.append((user_input, "Please enter a question to analyze.")) |
|
yield (history, state, None, None, None, None, None) |
|
return |
|
|
|
|
|
history.append((user_input, f"Analyzing: '{query_context}'\n\n*Generating analysis plan...*")) |
|
yield (history, state, None, None, None, None, None) |
|
|
|
analysis_plan = llm_generate_analysis_plan_with_history(query_context, field_metadata, history) |
|
if not analysis_plan: |
|
history.append((None, "I'm sorry, I couldn't generate a valid analysis plan for that request. Please try rephrasing.")) |
|
yield (history, state, None, None, None, None, None) |
|
return |
|
|
|
history.append((None, "β
Analysis plan generated!")) |
|
plan_summary = f""" |
|
* **Analysis Dimension:** `{analysis_plan.get('analysis_dimension')}` |
|
* **Analysis Measure:** `{analysis_plan.get('analysis_measure')}` |
|
* **Query Filter:** `{analysis_plan.get('query_filter')}` |
|
""" |
|
|
|
history.append((None, plan_summary)) |
|
|
|
formatted_plan = f"**Full Analysis Plan:**\n```json\n{json.dumps(analysis_plan, indent=2)}\n```" |
|
yield (history, state, None, None, gr.update(value=formatted_plan, visible=True), None, None) |
|
|
|
|
|
|
|
history.append((None, "*Executing queries for aggregates and examples...*")) |
|
yield (history, state, None, None, gr.update(value=formatted_plan, visible=True), None, None) |
|
|
|
aggregate_data = None |
|
example_data = None |
|
with concurrent.futures.ThreadPoolExecutor() as executor: |
|
future_agg = executor.submit(execute_quantitative_query, analysis_plan, solr_client) |
|
future_ex = executor.submit(execute_qualitative_query, analysis_plan, solr_client) |
|
aggregate_data = future_agg.result() |
|
example_data = future_ex.result() |
|
|
|
if not aggregate_data or aggregate_data.get('count', 0) == 0: |
|
history.append((None, "No data was found for your query. Please try a different question.")) |
|
yield (history, state, None, None, gr.update(value=formatted_plan, visible=True), None, None) |
|
return |
|
|
|
|
|
formatted_agg_data = f"**Quantitative (Aggregate) Data:**\n```json\n{json.dumps(aggregate_data, indent=2)}\n```" |
|
formatted_qual_data = f"**Qualitative (Example) Data:**\n```json\n{json.dumps(example_data, indent=2)}\n```" |
|
qual_data_display_update = gr.update(value=formatted_qual_data, visible=True) |
|
yield (history, state, None, None, gr.update(value=formatted_plan, visible=True), gr.update(value=formatted_agg_data, visible=True), qual_data_display_update) |
|
|
|
|
|
|
|
history.append((None, "β
Data retrieved. Generating visualization and final report...")) |
|
yield (history, state, None, None, gr.update(value=formatted_plan, visible=True), gr.update(value=formatted_agg_data, visible=True), qual_data_display_update) |
|
|
|
with concurrent.futures.ThreadPoolExecutor() as executor: |
|
viz_future = executor.submit(llm_generate_visualization_code, query_context, aggregate_data) |
|
|
|
|
|
report_text = "" |
|
stream_history = history[:] |
|
for chunk in llm_synthesize_enriched_report_stream(query_context, aggregate_data, example_data, analysis_plan): |
|
report_text += chunk |
|
yield (stream_history, state, None, gr.update(value=report_text, visible=True), gr.update(value=formatted_plan, visible=True), gr.update(value=formatted_agg_data, visible=True), qual_data_display_update) |
|
|
|
history.append((None, report_text)) |
|
|
|
|
|
viz_code = viz_future.result() |
|
plot_path = execute_viz_code_and_get_path(viz_code, aggregate_data) |
|
output_plot = gr.update(value=plot_path, visible=True) if plot_path else gr.update(visible=False) |
|
if not plot_path: |
|
history.append((None, "*I was unable to generate a plot for this data.*\n")) |
|
|
|
yield (history, state, output_plot, report_text, gr.update(value=formatted_plan, visible=True), gr.update(value=formatted_agg_data, visible=True), qual_data_display_update) |
|
|
|
|
|
state['query_count'] += 1 |
|
state['last_suggestions'] = parse_suggestions_from_report(report_text) |
|
next_prompt = "Analysis complete. What would you like to explore next?" |
|
history.append((None, next_prompt)) |
|
yield (history, state, output_plot, report_text, gr.update(value=formatted_plan, visible=True), gr.update(value=formatted_agg_data, visible=True), qual_data_display_update) |
|
|
|
|
|
|
|
with gr.Blocks(theme=gr.themes.Soft(), css="footer {display: none !important}") as demo: |
|
state = gr.State() |
|
|
|
with gr.Row(): |
|
with gr.Column(scale=4): |
|
gr.Markdown("# π PharmaCircle AI Data Analyst") |
|
with gr.Column(scale=1): |
|
clear_button = gr.Button("π Start New Analysis", variant="primary") |
|
|
|
gr.Markdown("Ask a question to begin your analysis. I will generate an analysis plan, retrieve quantitative and qualitative data, create a visualization, and write an enriched report.") |
|
|
|
with gr.Row(): |
|
with gr.Column(scale=1): |
|
chatbot = gr.Chatbot(label="Analysis Chat Log", height=700, show_copy_button=True) |
|
msg_textbox = gr.Textbox(placeholder="Ask a question, e.g., 'Show me the top 5 companies by total deal value in 2023'", label="Your Question", interactive=True) |
|
|
|
with gr.Column(scale=2): |
|
with gr.Accordion("Generated Analysis Plan", open=False): |
|
plan_display = gr.Markdown("Plan will appear here...", visible=True) |
|
with gr.Accordion("Retrieved Quantitative Data", open=False): |
|
quantitative_data_display = gr.Markdown("Aggregate data will appear here...", visible=False) |
|
with gr.Accordion("Retrieved Qualitative Data (Examples)", open=False): |
|
qualitative_data_display = gr.Markdown("Example data will appear here...", visible=False) |
|
plot_display = gr.Image(label="Visualization", type="filepath", visible=False) |
|
report_display = gr.Markdown("Report will be streamed here...", visible=False) |
|
|
|
|
|
def reset_all(): |
|
"""Resets the entire UI for a new analysis session.""" |
|
return ( |
|
[], |
|
None, |
|
"", |
|
gr.update(value=None, visible=False), |
|
gr.update(value=None, visible=False), |
|
gr.update(value=None, visible=False), |
|
gr.update(value=None, visible=False), |
|
gr.update(value=None, visible=False) |
|
) |
|
|
|
msg_textbox.submit( |
|
fn=process_analysis_flow, |
|
inputs=[msg_textbox, chatbot, state], |
|
outputs=[chatbot, state, plot_display, report_display, plan_display, quantitative_data_display, qualitative_data_display], |
|
).then( |
|
lambda: gr.update(value=""), |
|
None, |
|
[msg_textbox], |
|
queue=False, |
|
) |
|
|
|
clear_button.click( |
|
fn=reset_all, |
|
inputs=None, |
|
outputs=[chatbot, state, msg_textbox, plot_display, report_display, plan_display, quantitative_data_display, qualitative_data_display], |
|
queue=False |
|
) |
|
|
|
if is_initialized: |
|
demo.queue().launch(debug=True, share=True) |
|
else: |
|
print("\nSkipping Gradio launch due to initialization errors.") |
|
|