dolphinium committed on
Commit
4d74be9
Β·
verified Β·
1 Parent(s): 42a7bc0

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +813 -0
  2. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,813 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import json
3
+ import re
4
+ import datetime
5
+ import pandas as pd
6
+ import pysolr
7
+ import google.generativeai as genai
8
+ from sshtunnel import SSHTunnelForwarder
9
+ import matplotlib.pyplot as plt
10
+ import seaborn as sns
11
+ import io
12
+ import os
13
+ import logging
14
+ import concurrent.futures
15
+ from IPython.display import display, Markdown
16
+ import copy
17
+
18
+
19
# --- Suppress Matplotlib Debug Logs ---
logging.getLogger('matplotlib').setLevel(logging.WARNING)

# --- SSH Tunnel Configuration ---
# It's recommended to load secrets securely, e.g., from environment variables
# NOTE: each os.environ.get() below yields None when the variable is unset;
# the failure then surfaces inside the setup try-block further down.
SSH_HOST = os.environ.get('SSH_HOST')
SSH_PORT = 5322
SSH_USER = os.environ.get('SSH_USER')
SSH_PASS = os.environ.get('SSH_PASS')

# --- Solr Configuration ---
REMOTE_SOLR_HOST = '69.167.186.48'  # Solr host as reachable from the SSH bastion
REMOTE_SOLR_PORT = 8983             # Solr port on the remote side
LOCAL_BIND_PORT = 8983              # local end of the SSH tunnel
SOLR_CORE_NAME = 'news'
SOLR_USER = os.environ.get('SOLR_USER')
SOLR_PASS = os.environ.get('SOLR_PASS')

# --- Google Gemini Configuration ---
try:
    genai.configure(api_key=os.environ.get('GEMINI_API_KEY'))
except Exception as e:
    print(f"❌ Gemini API Key Error: {e}. Please ensure 'GEMINI_API_KEY' is set in your environment.")

# --- Global Variables ---
# Populated by the setup block below and read by the Gradio handlers.
ssh_tunnel_server = None
solr_client = None
llm_model = None
is_initialized = False

try:
    # 1. Start the SSH Tunnel
    ssh_tunnel_server = SSHTunnelForwarder(
        (SSH_HOST, SSH_PORT),
        ssh_username=SSH_USER,
        ssh_password=SSH_PASS,
        remote_bind_address=(REMOTE_SOLR_HOST, REMOTE_SOLR_PORT),
        local_bind_address=('127.0.0.1', LOCAL_BIND_PORT)
    )
    ssh_tunnel_server.start()
    # NOTE(review): the emoji in these log strings appear mojibake in this
    # extraction; presumably the original file contains proper UTF-8 emoji —
    # confirm against the repository before normalizing.
    print(f"πŸš€ SSH tunnel established: Local Port {ssh_tunnel_server.local_bind_port} -> Remote Solr.")

    # 2. Initialize the pysolr client (all traffic goes through the tunnel)
    solr_url = f'http://127.0.0.1:{ssh_tunnel_server.local_bind_port}/solr/{SOLR_CORE_NAME}'
    solr_client = pysolr.Solr(solr_url, auth=(SOLR_USER, SOLR_PASS), always_commit=True)
    solr_client.ping()
    print(f"βœ… Solr connection successful on core '{SOLR_CORE_NAME}'.")

    # 3. Initialize the LLM (temperature 0 for deterministic query planning)
    llm_model = genai.GenerativeModel('gemini-2.5-flash', generation_config=genai.types.GenerationConfig(temperature=0))
    print(f"βœ… LLM Model '{llm_model.model_name}' initialized.")

    print("βœ… System Initialized Successfully.")
    is_initialized = True

except Exception as e:
    print(f"\n❌ An error occurred during setup: {e}")
    # Tear down the tunnel if any later step failed so the local port
    # is not left bound.
    if ssh_tunnel_server and ssh_tunnel_server.is_active:
        ssh_tunnel_server.stop()
78
+
79
+
80
# Schema documentation for the Solr 'news' core, injected into LLM prompts.
# Each entry: field_name, a human-readable type tag, sample values, and a
# definition telling the planner LLM when to use the field. Convention
# (per the definitions below): plain fields are for exact-match faceting,
# `_s`-suffixed fields are multi-valued synonym fields for searching.
field_metadata = [
    {
        "field_name": "business_model",
        "type": "string (categorical)",
        "example_values": ["pharma/bio", "drug delivery", "pharma services"],
        "definition": "The primary business category of the company involved in the news. Use for filtering by high-level industry segments."
    },
    {
        "field_name": "news_type",
        "type": "string (categorical)",
        "example_values": ["product news", "financial news", "regulatory news"],
        "definition": "The category of the news article itself (e.g., financial, regulatory, acquisition). Use for filtering by the type of event being reported."
    },
    {
        "field_name": "event_type",
        "type": "string (categorical)",
        "example_values": ["phase 2", "phase 1", "pre clinical", "marketed"],
        "definition": "The clinical or developmental stage of a product or event discussed in the article. Essential for queries about clinical trial phases."
    },
    {
        "field_name": "source",
        "type": "string (categorical)",
        "example_values": ["Press Release", "PR Newswire", "Business Wire"],
        "definition": "The original source of the news article, such as a newswire or official report."
    },
    {
        "field_name": "company_name",
        "type": "string (exact match, for faceting)",
        "example_values": ["pfizer inc.", "astrazeneca plc", "roche"],
        "definition": "The canonical, standardized name of a company. **Crucially, you MUST use this field for `terms` faceting** to group results by a unique company. Do NOT use this for searching."
    },
    {
        "field_name": "company_name_s",
        "type": "string (multi-valued, for searching)",
        "example_values": ["pfizer inc.", "roche", "f. hoffmann-la roche ag", "nih"],
        "definition": "A field containing all known names and synonyms for a company. **You MUST use this field for all `query` parameter searches involving a company name** to ensure comprehensive results. Do NOT use for `terms` faceting."
    },
    {
        "field_name": "territory_hq_s",
        "type": "string (multi-valued, hierarchical)",
        "example_values": ["united states of america", "europe", "europe western"],
        "definition": "The geographic location (country and continent) of a company's headquarters. It is hierarchical. Use for filtering by location."
    },
    {
        "field_name": "therapeutic_category",
        "type": "string (specific)",
        "example_values": ["cancer, other", "cancer, nsclc metastatic", "alzheimer's"],
        "definition": "The specific disease or therapeutic area being targeted. Use for very specific disease queries."
    },
    {
        "field_name": "therapeutic_category_s",
        "type": "string (multi-valued, for searching)",
        "example_values": ["cancer", "oncology", "infections", "cns"],
        "definition": "Broader, multi-valued therapeutic categories and their synonyms. **Use this field for broad category searches** in the `query` parameter."
    },
    {
        "field_name": "compound_name",
        "type": "string (exact match, for faceting)",
        "example_values": ["opdivo injection solution", "keytruda injection solution"],
        "definition": "The specific, full trade name of a drug. **Use this field for `terms` faceting** on compounds."
    },
    {
        "field_name": "compound_name_s",
        "type": "string (multi-valued, for searching)",
        "example_values": ["nivolumab injection solution", "opdivo injection solution", "ono-4538 injection solution"],
        "definition": "A field with all known trade names and synonyms for a drug. **Use this field for all `query` parameter searches** involving a compound name."
    },
    {
        "field_name": "molecule_name",
        "type": "string (exact match, for faceting)",
        "example_values": ["cannabidiol", "paclitaxel", "pembrolizumab"],
        "definition": "The generic, non-proprietary name of the active molecule. **Use this field for `terms` faceting** on molecules."
    },
    {
        "field_name": "molecule_name_s",
        "type": "string (multi-valued, for searching)",
        "example_values": ["cbd", "s1-220", "a1002n5s"],
        "definition": "A field with all known generic names and synonyms for a molecule. **Use this field for all `query` parameter searches** involving a molecule name."
    },
    {
        "field_name": "highest_phase",
        "type": "string (categorical)",
        "example_values": ["marketed", "phase 2", "phase 1"],
        "definition": "The highest stage of development a drug has ever reached."
    },
    {
        "field_name": "drug_delivery_branch_s",
        "type": "string (multi-valued, for searching)",
        "example_values": ["injection", "parenteral", "oral", "injection, other", "oral, other"],
        "definition": "The method of drug administration. **Use this for `query` parameter searches about route of administration** as it contains broader, search-friendly terms."
    },
    {
        "field_name": "drug_delivery_branch",
        "type": "string (categorical, specific, for faceting)",
        "example_values": ["injection, other", "prefilled syringes", "np liposome", "oral enteric/delayed release"],
        "definition": "The most specific category of drug delivery technology. **Use this field for `terms` faceting** on specific delivery technologies."
    },
    {
        "field_name": "route_branch",
        "type": "string (categorical)",
        "example_values": ["injection", "oral", "topical", "inhalation"],
        "definition": "The primary route of drug administration. Good for faceting on exact routes."
    },
    {
        "field_name": "molecule_api_group",
        "type": "string (categorical)",
        "example_values": ["small molecules", "biologics", "nucleic acids"],
        "definition": "High-level classification of the drug's molecular type."
    },
    {
        "field_name": "content",
        "type": "text (full-text search)",
        "example_values": ["The largest study to date...", "balstilimab..."],
        "definition": "The full text content of the news article. Use for keyword searches on topics not covered by other specific fields."
    },
    {
        "field_name": "date",
        "type": "date",
        "example_values": ["2020-10-22T00:00:00Z"],
        "definition": "The full publication date and time in ISO 8601 format. Use for precise date range queries."
    },
    {
        "field_name": "date_year",
        "type": "number (year)",
        "example_values": [2020, 2021, 2022],
        "definition": "The 4-digit year of publication. **Use this for queries involving whole years** (e.g., 'in 2023', 'last year', 'since 2020')."
    },
    {
        "field_name": "total_deal_value_in_million",
        "type": "number (metric)",
        "example_values": [50, 120.5, 176.157, 1000],
        "definition": "The total value of a financial deal, in millions of USD. This is the primary numeric field for financial aggregations (sum, avg, etc.). To use this, you must also filter for news that has a deal value, e.g., 'total_deal_value_in_million:[0 TO *]'."
    }
]
214
+
215
+ # Helper function to format the metadata for the prompt
216
def format_metadata_for_prompt(metadata):
    """Render field-metadata records as a Markdown bullet list for LLM prompts.

    Args:
        metadata: Iterable of dicts, each with keys ``field_name``, ``type``,
            ``definition`` and ``example_values`` (a list; items are coerced
            to ``str``).

    Returns:
        One string with a four-line bullet block per field, each block
        terminated by a blank line. Empty input yields ``""``.
    """
    # Build the pieces and join once instead of repeated += concatenation,
    # which is quadratic for long metadata lists.
    sections = [
        f"- **{field['field_name']}**\n"
        f" - **Type**: {field['type']}\n"
        f" - **Definition**: {field['definition']}\n"
        f" - **Examples**: {', '.join(map(str, field['example_values']))}\n\n"
        for field in metadata
    ]
    return "".join(sections)
224
# Pre-render the schema documentation once at import time; it is embedded
# verbatim into every planning prompt.
formatted_field_info = format_metadata_for_prompt(field_metadata)
225
+
226
+
227
def parse_suggestions_from_report(report_text):
    """Extracts numbered suggestions from the report's markdown text.

    Looks for either suggestion-section heading the report generator emits
    and returns the numbered items under it, stripped of whitespace.
    Returns an empty list when no such section exists.
    """
    section_pattern = (
        r"### (?:Deeper Dive: Suggested Follow-up Analyses"
        r"|Suggestions for Further Exploration)\s*\n(.*?)$"
    )
    section = re.search(section_pattern, report_text, re.DOTALL | re.IGNORECASE)
    if section is None:
        return []
    # Pull every "N. text" line out of the section body.
    items = re.findall(r"^\s*\d+\.\s*(.*)", section.group(1), re.MULTILINE)
    return [item.strip() for item in items]
234
+
235
+
236
def llm_generate_analysis_plan_with_history(natural_language_query, field_metadata, chat_history):
    """
    Generates a complete analysis plan from a user query, considering chat history.
    This plan includes dimensions, measures, and requests for both quantitative
    (facet) and qualitative (grouping) data.

    Returns the plan as a dict parsed from the LLM's JSON output, or None on
    any failure (LLM error or unparseable response).

    NOTE(review): the `field_metadata` parameter is not used — the prompt
    embeds the module-level `formatted_field_info` instead. Confirm whether
    the parameter can be dropped or should replace the global.
    """
    # Forward only the user side of each (user, bot) history pair; bot
    # replies are intentionally dropped from the planning context.
    formatted_history = ""
    for user_msg, bot_msg in chat_history:
        if user_msg:
            formatted_history += f"- User: \"{user_msg}\"\n"

    prompt = f"""
You are an expert data analyst and Solr query engineer. Your task is to convert a natural language question into a structured JSON "Analysis Plan". This plan will be used to run two separate, efficient queries: one for aggregate data (facets) and one for finding illustrative examples (grouping).

---
### CONTEXT & RULES

1. **Today's Date for Calculations**: {datetime.datetime.now().date().strftime("%Y-%m-%d")}
2. **Field Usage**: You MUST use the fields described in the 'Field Definitions'. Pay close attention to the definitions to select the correct field, especially the `_s` fields for searching. Do not use fields ending with `_s` in `group.field` or facet `field` unless necessary for the analysis.
3. **Dimension vs. Measure**:
    * `analysis_dimension`: The primary categorical field the user wants to group by (e.g., `company_name`, `route_branch`). This is the `group by` field.
    * `analysis_measure`: The metric to aggregate (e.g., `sum(total_deal_value_in_million)`) or the method of counting (`count`).
    * `sort_field_for_examples`: The raw field used to find the "best" example. If `analysis_measure` is `sum(field)`, this should be `field`. If `analysis_measure` is `count`, this should be a relevant field like `date`.
4. **Crucial Sorting Rules**:
    * For `group.sort`: If `analysis_measure` involves a function on a field (e.g., `sum(total_deal_value_in_million)`), you MUST use the full function: `group.sort: 'sum(total_deal_value_in_million) desc'`.
    * If `analysis_measure` is 'count', you MUST OMIT the `group.sort` parameter entirely.
    * For sorting, NEVER use 'date_year'; use 'date' instead.
5. **Output Format**: Your final output must be a single, raw JSON object. Do not add comments or markdown formatting.

---
### FIELD DEFINITIONS (Your Source of Truth)

{formatted_field_info}
---
### CHAT HISTORY
{formatted_history}
---
### EXAMPLES

**User Query 1:** "What are the top 5 companies by total deal value in 2023?"
**Correct JSON Output 1:**
```json
{{
  "analysis_dimension": "company_name",
  "analysis_measure": "sum(total_deal_value_in_million)",
  "sort_field_for_examples": "total_deal_value_in_million",
  "query_filter": "date_year:2023 AND total_deal_value_in_million:[0 TO *]",
  "quantitative_request": {{
    "json.facet": {{
      "companies_by_deal_value": {{
        "type": "terms",
        "field": "company_name",
        "limit": 5,
        "sort": "total_value desc",
        "facet": {{
          "total_value": "sum(total_deal_value_in_million)"
        }}
      }}
    }}
  }},
  "qualitative_request": {{
    "group": true,
    "group.field": "company_name",
    "group.limit": 1,
    "group.sort": "sum(total_deal_value_in_million) desc",
    "sort": "total_deal_value_in_million desc"
  }}
}}
```

**User Query 2:** "What are the most common news types for infections this year?"
**Correct JSON Output 2:**
```json
{{
  "analysis_dimension": "news_type",
  "analysis_measure": "count",
  "sort_field_for_examples": "date",
  "query_filter": "therapeutic_category_s:infections AND date_year:{datetime.datetime.now().year}",
  "quantitative_request": {{
    "json.facet": {{
      "news_by_type": {{
        "type": "terms",
        "field": "news_type",
        "limit": 10,
        "sort": "count desc"
      }}
    }}
  }},
  "qualitative_request": {{
    "group": true,
    "group.field": "news_type",
    "group.limit": 1,
    "sort": "date desc"
  }}
}}
```
---
### YOUR TASK

Convert the following user query into a single, raw JSON "Analysis Plan" object, strictly following all rules and considering the chat history.

**Current User Query:** `{natural_language_query}`
"""
    try:
        response = llm_model.generate_content(prompt)
        # Strip any ```json fences the model may add despite instructions.
        cleaned_text = re.sub(r'```json\s*|\s*```', '', response.text, flags=re.MULTILINE | re.DOTALL).strip()
        plan = json.loads(cleaned_text)
        return plan
    except Exception as e:
        # `response` is unbound if generate_content itself raised; guard it.
        raw_response_text = response.text if 'response' in locals() else 'N/A'
        print(f"Error in llm_generate_analysis_plan_with_history: {e}\nRaw Response:\n{raw_response_text}")
        return None
349
+
350
def execute_quantitative_query(plan, solr):
    """Executes the facet query to get aggregate data.

    Runs the plan's `json.facet` request with rows=0 (aggregates only) and
    returns the `facets` section of the Solr response, or None when the plan
    carries no facet request or the query fails.
    """
    has_facet_request = (
        plan
        and 'quantitative_request' in plan
        and 'json.facet' in plan.get('quantitative_request', {})
    )
    if not has_facet_request:
        return None
    try:
        search_params = {
            "q": plan.get('query_filter', '*:*'),
            "rows": 0,
            "json.facet": json.dumps(plan['quantitative_request']['json.facet']),
        }
        response = solr.search(**search_params)
        return response.raw_response.get("facets", {})
    except Exception as e:
        print(f"Error in quantitative query: {e}")
        return None
365
+
366
def execute_qualitative_query(plan, solr):
    """Executes the grouping query to get the best example docs.

    Returns the `grouped` section of the Solr response, or None when the plan
    has no qualitative request or the query fails.
    """
    if not plan or 'qualitative_request' not in plan:
        return None
    try:
        # Deep-copy so the caller's plan is never mutated by the merge below.
        group_request = copy.deepcopy(plan['qualitative_request'])
        # Defaults first; keys present in the qualitative request take
        # precedence over them (same as the original **-merge ordering).
        search_params = {
            "q": plan.get('query_filter', '*:*'),
            "rows": 3,  # Get a few examples per group
            "fl": "*,score",
        }
        search_params.update(group_request)
        response = solr.search(**search_params)
        return response.grouped
    except Exception as e:
        print(f"Error in qualitative query: {e}")
        return None
383
+
384
def llm_synthesize_enriched_report_stream(query, quantitative_data, qualitative_data, plan):
    """
    Generates an enriched report by synthesizing quantitative aggregates
    and qualitative examples, and streams the result.

    Args:
        query: The user's original natural-language question.
        quantitative_data: Facet aggregates from execute_quantitative_query.
        qualitative_data: Grouped example docs from execute_qualitative_query.
        plan: The analysis plan dict (provides dimension and example sort field).

    Yields:
        Markdown text chunks as they stream from the LLM, or a single
        apology string if the LLM call fails.
    """
    qualitative_prompt_str = ""
    dimension = plan.get('analysis_dimension', 'N/A')
    if qualitative_data and dimension in qualitative_data:
        for group in qualitative_data.get(dimension, {}).get('groups', []):
            group_value = group.get('groupValue', 'N/A')
            if group.get('doclist', {}).get('docs'):
                doc = group.get('doclist', {}).get('docs', [{}])[0]
                # BUG FIX: Solr multi-valued fields arrive as lists; unwrap the
                # abstract the same way metric_val is unwrapped below, so the
                # prompt shows "Some Title" rather than "['Some Title']".
                title_raw = doc.get('abstract', ['No Title'])
                title = title_raw[0] if isinstance(title_raw, list) and title_raw else title_raw
                content_list = doc.get('content', [])
                # First 40 words of the first content value, as a teaser.
                content_snip = (' '.join(content_list[0].split()[:40]) + '...') if content_list else 'No content available.'
                metric_val_raw = doc.get(plan.get('sort_field_for_examples'), 'N/A')
                metric_val = metric_val_raw[0] if isinstance(metric_val_raw, list) else metric_val_raw

                qualitative_prompt_str += f"- **For category `{group_value}`:**\n"
                qualitative_prompt_str += f" - **Top Example Title:** {title}\n"
                qualitative_prompt_str += f" - **Metric Value:** {metric_val}\n"
                qualitative_prompt_str += f" - **Content Snippet:** {content_snip}\n\n"

    prompt = f"""
You are a top-tier business intelligence analyst. Your task is to write an insightful, data-driven report for an executive. You must synthesize quantitative data (the 'what') with qualitative examples (the 'why') to tell a complete story.

---
### AVAILABLE INFORMATION

**1. The User's Core Question:**
\"{query}\"

**2. Quantitative Data (The 'What'):**
This data shows the high-level aggregates.
```json
{json.dumps(quantitative_data, indent=2)}
```

**3. Qualitative Data (The 'Why'):**
These are the single most significant documents driving the numbers for each category.
{qualitative_prompt_str}

---
### REPORTING INSTRUCTIONS

Your report must be in clean, professional Markdown and follow this structure precisely.

**Report Structure:**

`## Executive Summary`
- A 1-2 sentence, top-line answer to the user's question based on the quantitative data.

`### Key Findings`
- Use bullet points to highlight the main figures from the quantitative data. Interpret the numbers.

`### Key Drivers & Illustrative Examples`
- **This is the most important section.** Explain the "so what?" behind the numbers.
- Use the qualitative examples to explain *why* a category is high or low. Reference the top example document for each main category.

`### Deeper Dive: Suggested Follow-up Analyses`
- Propose 2-3 logical next questions based on your analysis to uncover deeper trends.

---
**Generate the full report now, paying close attention to all formatting and spacing rules.**
"""
    try:
        response_stream = llm_model.generate_content(prompt, stream=True)
        for chunk in response_stream:
            yield chunk.text
    except Exception as e:
        print(f"Error in llm_synthesize_enriched_report_stream: {e}")
        yield "Sorry, I was unable to generate a report for this data."
456
+
457
+
458
def llm_generate_visualization_code(query_context, facet_data):
    """Generates Python code for visualization based on query and data.

    Builds a prompt containing three parsing templates (simple terms facet,
    query facets, nested terms facet), asks the LLM for raw matplotlib/seaborn
    code, strips any markdown fences, and returns the code string.
    Returns None on any LLM failure.
    """
    prompt = f"""
You are a Python Data Visualization expert specializing in Matplotlib and Seaborn.
Your task is to generate robust, error-free Python code to create a single, insightful visualization based on the user's query and the provided Solr facet data.

**User's Analytical Goal:**
\"{query_context}\"

**Aggregated Data (from Solr Facets):**
```json
{json.dumps(facet_data, indent=2)}
```

---
### **CRITICAL INSTRUCTIONS: CODE GENERATION RULES**
You MUST follow these rules to avoid errors.

**1. Identify the Data Structure FIRST:**
Before writing any code, analyze the `facet_data` JSON to determine its structure. There are three common patterns. Choose the correct template below.

* **Pattern A: Simple `terms` Facet.** The JSON has ONE main key (besides "count") which contains a list of "buckets". Each bucket has a "val" and a "count". Use this for standard bar charts.
* **Pattern B: Multiple `query` Facets.** The JSON has MULTIPLE keys (besides "count"), and each key is an object containing metrics like "count" or "sum(...)". Use this for comparing a few distinct items (e.g., "oral vs injection").
* **Pattern C: Nested `terms` Facet.** The JSON has one main key with a list of "buckets", but inside EACH bucket, there are nested metric objects. This is used for grouped comparisons (e.g., "compare 2024 vs 2025 across categories"). This almost always requires `pandas`.

**2. Use the Correct Parsing Template:**

---
**TEMPLATE FOR PATTERN A (Simple Bar Chart from `terms` facet):**
```python
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

plt.style.use('seaborn-v0_8-whitegrid')
fig, ax = plt.subplots(figsize=(12, 8))

# Dynamically find the main facet key (the one with 'buckets')
facet_key = None
for key, value in facet_data.items():
    if isinstance(value, dict) and 'buckets' in value:
        facet_key = key
        break

if facet_key:
    buckets = facet_data[facet_key].get('buckets', [])
    # Check if buckets contain data
    if buckets:
        df = pd.DataFrame(buckets)
        # Check for a nested metric or use 'count'
        if 'total_deal_value' in df.columns and pd.api.types.is_dict_like(df['total_deal_value'].iloc):
            # Example for nested sum metric
            df['value'] = df['total_deal_value'].apply(lambda x: x.get('sum', 0))
            y_axis_label = 'Sum of Total Deal Value'
        else:
            df.rename(columns={{'count': 'value'}}, inplace=True)
            y_axis_label = 'Count'

        sns.barplot(data=df, x='val', y='value', ax=ax, palette='viridis')
        ax.set_xlabel('Category')
        ax.set_ylabel(y_axis_label)
    else:
        ax.text(0.5, 0.5, 'No data in buckets to plot.', ha='center')


ax.set_title('Your Insightful Title Here')
# Correct way to rotate labels to prevent errors
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
plt.tight_layout()
```
---
**TEMPLATE FOR PATTERN B (Comparison Bar Chart from `query` facets):**
```python
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

plt.style.use('seaborn-v0_8-whitegrid')
fig, ax = plt.subplots(figsize=(10, 6))

labels = []
values = []
# Iterate through top-level keys, skipping the 'count'
for key, data_dict in facet_data.items():
    if key == 'count' or not isinstance(data_dict, dict):
        continue
    # Extract the label (e.g., 'oral_deals' -> 'Oral')
    label = key.replace('_deals', '').replace('_', ' ').title()
    # Find the metric value, which is NOT 'count'
    metric_value = 0
    for sub_key, sub_value in data_dict.items():
        if sub_key != 'count':
            metric_value = sub_value
            break # Found the metric
    labels.append(label)
    values.append(metric_value)

if labels:
    sns.barplot(x=labels, y=values, ax=ax, palette='mako')
    ax.set_ylabel('Total Deal Value') # Or other metric name
    ax.set_xlabel('Category')
else:
    ax.text(0.5, 0.5, 'No query facet data to plot.', ha='center')


ax.set_title('Your Insightful Title Here')
plt.tight_layout()
```
---
**TEMPLATE FOR PATTERN C (Grouped Bar Chart from nested `terms` facet):**
```python
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

plt.style.use('seaborn-v0_8-whitegrid')
fig, ax = plt.subplots(figsize=(14, 8))

# Find the key that has the buckets
facet_key = None
for key, value in facet_data.items():
    if isinstance(value, dict) and 'buckets' in value:
        facet_key = key
        break

if facet_key and facet_data[facet_key].get('buckets'):
    # This list comprehension is robust for parsing nested metrics
    plot_data = []
    for bucket in facet_data[facet_key]['buckets']:
        category = bucket['val']
        # Find all nested metrics (e.g., total_deal_value_2025)
        for sub_key, sub_value in bucket.items():
            if isinstance(sub_value, dict) and 'sum' in sub_value:
                # Extracts year from 'total_deal_value_2025' -> '2025'
                year = sub_key.split('_')[-1]
                value = sub_value['sum']
                plot_data.append({{'Category': category, 'Year': year, 'Value': value}})

    if plot_data:
        df = pd.DataFrame(plot_data)
        sns.barplot(data=df, x='Category', y='Value', hue='Year', ax=ax)
        ax.set_ylabel('Total Deal Value')
        ax.set_xlabel('Business Model')
        # Correct way to rotate labels to prevent errors
        plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
    else:
        ax.text(0.5, 0.5, 'No nested data found to plot.', ha='center')
else:
    ax.text(0.5, 0.5, 'No data in buckets to plot.', ha='center')

ax.set_title('Your Insightful Title Here')
plt.tight_layout()
```
---
**3. Final Code Generation:**
- **DO NOT** include `plt.show()`.
- **DO** set a dynamic and descriptive `ax.set_title()`, `ax.set_xlabel()`, and `ax.set_ylabel()`.
- **DO NOT** wrap the code in ```python ... ```. Output only the raw Python code.
- Adapt the chosen template to the specific keys and metrics in the provided `facet_data`.

**Your Task:**
Now, generate the Python code.
"""
    try:
        # Increase the timeout for potentially complex generation
        generation_config = genai.types.GenerationConfig(temperature=0, max_output_tokens=2048)
        response = llm_model.generate_content(prompt, generation_config=generation_config)
        # Clean the response to remove markdown formatting
        code = re.sub(r'^```python\s*|```$', '', response.text, flags=re.MULTILINE)
        return code
    except Exception as e:
        # BUG FIX: `response` is unbound when generate_content itself raises,
        # so referencing response.text here crashed with a NameError that
        # masked the original error. Guard it the same way
        # llm_generate_analysis_plan_with_history does.
        raw_response_text = response.text if 'response' in locals() else 'N/A'
        print(f"Error in llm_generate_visualization_code: {e}\nRaw response: {raw_response_text}")
        return None
631
+
632
def execute_viz_code_and_get_path(viz_code, facet_data):
    """Executes visualization code and returns the path to the saved plot image.

    Args:
        viz_code: Python source produced by llm_generate_visualization_code;
            it is expected to define a matplotlib figure named `fig`.
        facet_data: The facet dict the code reads as `facet_data`.

    Returns:
        Path of the saved PNG under /tmp/plots, or None when viz_code is
        empty, defines no `fig`, or raises.

    SECURITY NOTE: `viz_code` is LLM-generated and run via `exec` with no
    sandboxing — acceptable only because it never contains end-user input
    directly; do not extend this to untrusted sources.
    """
    if not viz_code:
        return None
    try:
        # exist_ok avoids the check-then-create race of the previous
        # os.path.exists() + os.makedirs() pair.
        os.makedirs('/tmp/plots', exist_ok=True)
        # Timestamp in the filename keeps successive plots from colliding.
        plot_path = f"/tmp/plots/plot_{datetime.datetime.now().timestamp()}.png"
        # The exec environment needs access to the required libraries and the data
        exec_globals = {'facet_data': facet_data, 'plt': plt, 'sns': sns, 'pd': pd}
        exec(viz_code, exec_globals)
        fig = exec_globals.get('fig')
        if fig:
            fig.savefig(plot_path, bbox_inches='tight')
            plt.close(fig)  # Important to free up memory
            return plot_path
        return None
    except Exception as e:
        print(f"ERROR executing visualization code: {e}\n---Code---\n{viz_code}")
        return None
650
+
651
+
652
def process_analysis_flow(user_input, history, state):
    """
    Drive one complete analysis turn for the Gradio UI.

    A generator using the dual-query (quantitative/qualitative) approach.
    Every yield is a 7-tuple of component updates, in this fixed order:
    (chatbot history, session state, plot image, report markdown,
     plan accordion, quantitative-data accordion, qualitative-data accordion).
    """
    if state is None:
        state = {'query_count': 0, 'last_suggestions': []}
    if history is None:
        history = []

    # Reset UI for new analysis: blank out every result pane up front.
    yield (history, state,
           gr.update(value=None, visible=False),
           gr.update(value=None, visible=False),
           gr.update(value=None, visible=False),
           gr.update(value=None, visible=False),
           gr.update(value=None, visible=False))

    query_context = user_input.strip()
    if not query_context:
        history.append((user_input, "Please enter a question to analyze."))
        yield (history, state, None, None, None, None, None)
        return

    # 1. Acknowledge the question, then ask the LLM for an analysis plan.
    history.append((user_input, f"Analyzing: '{query_context}'\n\n*Generating analysis plan...*"))
    yield (history, state, None, None, None, None, None)

    analysis_plan = llm_generate_analysis_plan_with_history(query_context, field_metadata, history)
    if not analysis_plan:
        history.append((None, "I'm sorry, I couldn't generate a valid analysis plan for that request. Please try rephrasing."))
        yield (history, state, None, None, None, None, None)
        return

    history.append((None, "βœ… Analysis plan generated!"))
    plan_summary = f"""
* **Analysis Dimension:** `{analysis_plan.get('analysis_dimension')}`
* **Analysis Measure:** `{analysis_plan.get('analysis_measure')}`
* **Query Filter:** `{analysis_plan.get('query_filter')}`
"""
    # Short summary goes to the chat; the full JSON plan goes to the accordion.
    history.append((None, plan_summary))
    formatted_plan = f"**Full Analysis Plan:**\n```json\n{json.dumps(analysis_plan, indent=2)}\n```"
    plan_update = gr.update(value=formatted_plan, visible=True)
    yield (history, state, None, None, plan_update, None, None)

    # 2. Execute the aggregate and example Solr queries in parallel.
    history.append((None, "*Executing queries for aggregates and examples...*"))
    yield (history, state, None, None, plan_update, None, None)

    aggregate_data = None
    example_data = None
    with concurrent.futures.ThreadPoolExecutor() as pool:
        agg_future = pool.submit(execute_quantitative_query, analysis_plan, solr_client)
        example_future = pool.submit(execute_qualitative_query, analysis_plan, solr_client)
        aggregate_data = agg_future.result()
        example_data = example_future.result()

    if not aggregate_data or aggregate_data.get('count', 0) == 0:
        history.append((None, "No data was found for your query. Please try a different question."))
        yield (history, state, None, None, plan_update, None, None)
        return

    # Surface the raw retrieved data in the side accordions.
    formatted_agg_data = f"**Quantitative (Aggregate) Data:**\n```json\n{json.dumps(aggregate_data, indent=2)}\n```"
    formatted_qual_data = f"**Qualitative (Example) Data:**\n```json\n{json.dumps(example_data, indent=2)}\n```"
    agg_update = gr.update(value=formatted_agg_data, visible=True)
    qual_data_display_update = gr.update(value=formatted_qual_data, visible=True)
    yield (history, state, None, None, plan_update, agg_update, qual_data_display_update)

    # 3. Kick off visualization generation in the background while the
    #    report streams (both use the same aggregate data).
    history.append((None, "βœ… Data retrieved. Generating visualization and final report..."))
    yield (history, state, None, None, plan_update, agg_update, qual_data_display_update)

    with concurrent.futures.ThreadPoolExecutor() as pool:
        viz_future = pool.submit(llm_generate_visualization_code, query_context, aggregate_data)

        # 4. Stream the enriched report chunk by chunk into the report pane.
        #    A frozen copy of history is yielded so the chat log stays still
        #    while only the report markdown grows.
        report_text = ""
        stream_history = history[:]
        for chunk in llm_synthesize_enriched_report_stream(query_context, aggregate_data, example_data, analysis_plan):
            report_text += chunk
            yield (stream_history, state, None,
                   gr.update(value=report_text, visible=True),
                   plan_update, agg_update, qual_data_display_update)

        history.append((None, report_text))

        # Collect the visualization code produced in the background.
        viz_code = viz_future.result()

    plot_path = execute_viz_code_and_get_path(viz_code, aggregate_data)
    output_plot = gr.update(value=plot_path, visible=True) if plot_path else gr.update(visible=False)
    if not plot_path:
        history.append((None, "*I was unable to generate a plot for this data.*\n"))

    yield (history, state, output_plot, report_text, plan_update, agg_update, qual_data_display_update)

    # 5. Finalize: bump session counters, stash follow-up suggestions,
    #    and invite the next question.
    state['query_count'] += 1
    state['last_suggestions'] = parse_suggestions_from_report(report_text)
    history.append((None, "Analysis complete. What would you like to explore next?"))
    yield (history, state, output_plot, report_text, plan_update, agg_update, qual_data_display_update)
750
+
751
# --- Gradio UI ---
with gr.Blocks(theme=gr.themes.Soft(), css="footer {display: none !important}") as demo:
    state = gr.State()

    # Header row: title on the left, reset button on the right.
    with gr.Row():
        with gr.Column(scale=4):
            gr.Markdown("# πŸ’Š PharmaCircle AI Data Analyst")
        with gr.Column(scale=1):
            clear_button = gr.Button("πŸ”„ Start New Analysis", variant="primary")

    gr.Markdown("Ask a question to begin your analysis. I will generate an analysis plan, retrieve quantitative and qualitative data, create a visualization, and write an enriched report.")

    # Main layout: chat log on the left, plan/data/plot/report panes on the right.
    with gr.Row():
        with gr.Column(scale=1):
            chatbot = gr.Chatbot(label="Analysis Chat Log", height=700, show_copy_button=True)
            msg_textbox = gr.Textbox(placeholder="Ask a question, e.g., 'Show me the top 5 companies by total deal value in 2023'", label="Your Question", interactive=True)

        with gr.Column(scale=2):
            with gr.Accordion("Generated Analysis Plan", open=False):
                plan_display = gr.Markdown("Plan will appear here...", visible=True)
            with gr.Accordion("Retrieved Quantitative Data", open=False):
                quantitative_data_display = gr.Markdown("Aggregate data will appear here...", visible=False)
            with gr.Accordion("Retrieved Qualitative Data (Examples)", open=False):
                qualitative_data_display = gr.Markdown("Example data will appear here...", visible=False)
            plot_display = gr.Image(label="Visualization", type="filepath", visible=False)
            report_display = gr.Markdown("Report will be streamed here...", visible=False)

    # --- Event Wiring ---
    def reset_all():
        """Resets the entire UI for a new analysis session."""
        def hidden():
            # One cleared-and-hidden update per result component.
            return gr.update(value=None, visible=False)

        return (
            [],        # chatbot
            None,      # state
            "",        # msg_textbox
            hidden(),  # plot_display
            hidden(),  # report_display
            hidden(),  # plan_display
            hidden(),  # quantitative_data_display
            hidden(),  # qualitative_data_display
        )

    # Submitting a question runs the analysis generator, then clears the box.
    msg_textbox.submit(
        fn=process_analysis_flow,
        inputs=[msg_textbox, chatbot, state],
        outputs=[chatbot, state, plot_display, report_display, plan_display, quantitative_data_display, qualitative_data_display],
    ).then(
        lambda: gr.update(value=""),
        None,
        [msg_textbox],
        queue=False,
    )

    clear_button.click(
        fn=reset_all,
        inputs=None,
        outputs=[chatbot, state, msg_textbox, plot_display, report_display, plan_display, quantitative_data_display, qualitative_data_display],
        queue=False
    )

# Only launch the app when the SSH tunnel / Solr / LLM setup succeeded.
if is_initialized:
    demo.queue().launch(debug=True, share=True)
else:
    print("\nSkipping Gradio launch due to initialization errors.")
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ pysolr
3
+ sshtunnel
4
+ google-generativeai
5
+ pandas
6
+ seaborn
7
+ matplotlib
8
+ IPython