wekey1998 committed
Commit 25a13d5 · verified · 1 Parent(s): e5b0bb1

Update report.py

Files changed (1):
  1. report.py +469 -455
report.py CHANGED
@@ -1,606 +1,620 @@
  import logging
- from typing import Dict, List, Any, Optional
  import io
  from datetime import datetime
  import base64

- # PDF generation
  try:
-     from reportlab.lib.pagesizes import letter, A4
-     from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, Image
      from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
      from reportlab.lib.units import inch
      from reportlab.lib import colors
-     from reportlab.graphics.shapes import Drawing
-     from reportlab.graphics.charts.piecharts import Pie
-     from reportlab.graphics.charts.barcharts import VerticalBarChart
      REPORTLAB_AVAILABLE = True
  except ImportError:
      REPORTLAB_AVAILABLE = False

- # Plotting for charts in PDF
  try:
      import matplotlib.pyplot as plt
      import matplotlib
-     matplotlib.use('Agg')  # Use non-interactive backend
      MATPLOTLIB_AVAILABLE = True
  except ImportError:
      MATPLOTLIB_AVAILABLE = False

- logger = logging.getLogger(__name__)

  def generate_pdf_report(results: Dict[str, Any]) -> io.BytesIO:
-     """Generate a comprehensive PDF report"""
-     if not REPORTLAB_AVAILABLE:
-         logger.error("ReportLab not available for PDF generation")
          return _generate_simple_pdf_fallback(results)
-
      try:
-         # Create PDF buffer
-         buffer = io.BytesIO()
-
-         # Create document
-         doc = SimpleDocTemplate(
-             buffer,
-             pagesize=A4,
-             rightMargin=72,
-             leftMargin=72,
-             topMargin=72,
-             bottomMargin=18
-         )
-
-         # Get styles
-         styles = getSampleStyleSheet()
-
-         # Create custom styles
-         title_style = ParagraphStyle(
-             'CustomTitle',
-             parent=styles['Heading1'],
-             fontSize=24,
-             spaceAfter=30,
-             textColor=colors.HexColor('#2E86AB'),
-             alignment=1  # Center
-         )
-
-         heading_style = ParagraphStyle(
-             'CustomHeading',
-             parent=styles['Heading2'],
-             fontSize=16,
-             spaceAfter=12,
-             spaceBefore=20,
-             textColor=colors.HexColor('#2E86AB')
          )
-
-         # Build story (content)
-         story = []
-
-         # Title page
-         story.append(Paragraph("Global Business News Intelligence Report", title_style))
-         story.append(Spacer(1, 0.5*inch))
-
-         # Query and basic info
-         story.append(Paragraph(f"Analysis Target: {results.get('query', 'N/A')}", styles['Normal']))
-         story.append(Paragraph(f"Report Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", styles['Normal']))
-         story.append(Paragraph(f"Total Articles Analyzed: {results.get('total_articles', 0)}", styles['Normal']))
-         story.append(Paragraph(f"Processing Time: {results.get('processing_time', 0):.2f} seconds", styles['Normal']))
-         story.append(Spacer(1, 0.3*inch))
-
-         # Executive Summary
-         story.append(Paragraph("Executive Summary", heading_style))
-         summary_text = _create_executive_summary(results)
-         story.append(Paragraph(summary_text, styles['Normal']))
-         story.append(Spacer(1, 0.2*inch))
-
-         # Sentiment Analysis Section
-         story.append(Paragraph("Sentiment Analysis", heading_style))
-         sentiment_data = _create_sentiment_section(results, styles)
-         story.extend(sentiment_data)
-
-         # Top Stories Section
-         story.append(Paragraph("Key Stories", heading_style))
-         stories_data = _create_stories_section(results, styles)
-         story.extend(stories_data)
-
-         # Keywords Section
-         if 'keywords' in results and results['keywords']:
-             story.append(Paragraph("Key Topics and Themes", heading_style))
-             keywords_data = _create_keywords_section(results, styles)
-             story.extend(keywords_data)
-
-         # Sources Section
-         story.append(Paragraph("News Sources", heading_style))
-         sources_data = _create_sources_section(results, styles)
-         story.extend(sources_data)
-
-         # Methodology Section
-         story.append(Paragraph("Methodology", heading_style))
-         methodology_text = _create_methodology_section(results)
-         story.append(Paragraph(methodology_text, styles['Normal']))
-
-         # Build PDF
-         doc.build(story)
-
-         buffer.seek(0)
-         return buffer
-
-     except Exception as e:
-         logger.error(f"PDF generation failed: {str(e)}")
-         return _generate_simple_pdf_fallback(results)

- def _create_executive_summary(results: Dict[str, Any]) -> str:
-     """Create executive summary text"""
-     try:
-         query = results.get('query', 'the analyzed topic')
-         total_articles = results.get('total_articles', 0)
-         avg_sentiment = results.get('average_sentiment', 0)
-
-         sentiment_label = "positive" if avg_sentiment > 0.1 else "negative" if avg_sentiment < -0.1 else "neutral"
-
-         summary = f"This report analyzes {total_articles} news articles related to {query}. "
-         summary += f"The overall sentiment analysis reveals a {sentiment_label} tone with an average sentiment score of {avg_sentiment:.3f}. "
-
-         # Add sentiment distribution
-         dist = results.get('sentiment_distribution', {})
-         positive = dist.get('Positive', 0)
-         negative = dist.get('Negative', 0)
-         neutral = dist.get('Neutral', 0)
-
-         summary += f"The analysis shows {positive} positive articles ({positive/total_articles*100:.1f}%), "
-         summary += f"{negative} negative articles ({negative/total_articles*100:.1f}%), "
-         summary += f"and {neutral} neutral articles ({neutral/total_articles*100:.1f}%). "
-
-         # Add key insights
          if avg_sentiment > 0.2:
-             summary += "The predominantly positive coverage suggests favorable market conditions or public perception."
          elif avg_sentiment < -0.2:
-             summary += "The predominantly negative coverage indicates concerns or challenges that may require attention."
          else:
-             summary += "The balanced sentiment coverage suggests a mixed outlook with both opportunities and challenges present."
-
          return summary
-
      except Exception as e:
-         logger.error(f"Executive summary creation failed: {str(e)}")
          return "Analysis completed successfully with comprehensive sentiment evaluation across multiple news sources."

- def _create_sentiment_section(results: Dict[str, Any], styles) -> List:
-     """Create sentiment analysis section"""
-     story = []
-
      try:
-         # Sentiment distribution table
-         dist = results.get('sentiment_distribution', {})
-         sentiment_data = [
              ['Sentiment', 'Count', 'Percentage'],
-             ['Positive', str(dist.get('Positive', 0)), f"{dist.get('Positive', 0)/results.get('total_articles', 1)*100:.1f}%"],
-             ['Negative', str(dist.get('Negative', 0)), f"{dist.get('Negative', 0)/results.get('total_articles', 1)*100:.1f}%"],
-             ['Neutral', str(dist.get('Neutral', 0)), f"{dist.get('Neutral', 0)/results.get('total_articles', 1)*100:.1f}%"]
          ]
-
-         sentiment_table = Table(sentiment_data)
-         sentiment_table.setStyle(TableStyle([
              ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#2E86AB')),
              ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
              ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
              ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
              ('FONTSIZE', (0, 0), (-1, 0), 12),
-             ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
              ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
-             ('GRID', (0, 0), (-1, -1), 1, colors.black)
          ]))
-
-         story.append(sentiment_table)
-         story.append(Spacer(1, 0.2*inch))
-
-         # Add sentiment analysis explanation
-         explanation = "Sentiment analysis was performed using multiple models including VADER, Loughran-McDonald financial dictionary, and FinBERT. "
-         explanation += "Scores range from -1.0 (most negative) to +1.0 (most positive), with scores between -0.1 and +0.1 considered neutral."
-
          story.append(Paragraph(explanation, styles['Normal']))
-         story.append(Spacer(1, 0.2*inch))
-
      except Exception as e:
-         logger.error(f"Sentiment section creation failed: {str(e)}")
          story.append(Paragraph("Sentiment analysis data unavailable.", styles['Normal']))
-
      return story

- def _create_stories_section(results: Dict[str, Any], styles) -> List:
-     """Create top stories section"""
-     story = []
-
      try:
-         articles = results.get('articles', [])
          if not articles:
              story.append(Paragraph("No articles available for analysis.", styles['Normal']))
              return story
-
-         # Sort articles by sentiment score
-         sorted_articles = sorted(articles, key=lambda x: x.get('sentiment', {}).get('compound', 0), reverse=True)
-
-         # Most positive story
-         if sorted_articles and sorted_articles[0].get('sentiment', {}).get('compound', 0) > 0.1:
              story.append(Paragraph("Most Positive Coverage:", styles['Heading3']))
-             top_positive = sorted_articles[0]
-             story.append(Paragraph(f"<b>Title:</b> {top_positive.get('title', 'N/A')}", styles['Normal']))
-             story.append(Paragraph(f"<b>Source:</b> {top_positive.get('source', 'N/A')}", styles['Normal']))
-             story.append(Paragraph(f"<b>Sentiment Score:</b> {top_positive.get('sentiment', {}).get('compound', 0):.3f}", styles['Normal']))
-             if 'summary' in top_positive:
-                 story.append(Paragraph(f"<b>Summary:</b> {top_positive['summary'][:300]}...", styles['Normal']))
-             story.append(Spacer(1, 0.2*inch))
-
-         # Most negative story
-         negative_articles = sorted(articles, key=lambda x: x.get('sentiment', {}).get('compound', 0))
-         if negative_articles and negative_articles[0].get('sentiment', {}).get('compound', 0) < -0.1:
              story.append(Paragraph("Most Negative Coverage:", styles['Heading3']))
-             top_negative = negative_articles[0]
-             story.append(Paragraph(f"<b>Title:</b> {top_negative.get('title', 'N/A')}", styles['Normal']))
-             story.append(Paragraph(f"<b>Source:</b> {top_negative.get('source', 'N/A')}", styles['Normal']))
-             story.append(Paragraph(f"<b>Sentiment Score:</b> {top_negative.get('sentiment', {}).get('compound', 0):.3f}", styles['Normal']))
-             if 'summary' in top_negative:
-                 story.append(Paragraph(f"<b>Summary:</b> {top_negative['summary'][:300]}...", styles['Normal']))
-             story.append(Spacer(1, 0.2*inch))
-
-         # Recent stories (if dates available)
-         recent_articles = [a for a in articles if a.get('date')]
-         if recent_articles:
-             recent_articles.sort(key=lambda x: x.get('date', ''), reverse=True)
-             story.append(Paragraph("Most Recent Coverage:", styles['Heading3']))
-             recent = recent_articles[0]
-             story.append(Paragraph(f"<b>Title:</b> {recent.get('title', 'N/A')}", styles['Normal']))
-             story.append(Paragraph(f"<b>Source:</b> {recent.get('source', 'N/A')}", styles['Normal']))
-             story.append(Paragraph(f"<b>Date:</b> {recent.get('date', 'N/A')}", styles['Normal']))
-             story.append(Paragraph(f"<b>Sentiment Score:</b> {recent.get('sentiment', {}).get('compound', 0):.3f}", styles['Normal']))
-
      except Exception as e:
-         logger.error(f"Stories section creation failed: {str(e)}")
          story.append(Paragraph("Story analysis data unavailable.", styles['Normal']))
-
      return story

- def _create_keywords_section(results: Dict[str, Any], styles) -> List:
-     """Create keywords section"""
-     story = []
-
      try:
-         keywords = results.get('keywords', [])[:15]  # Top 15 keywords
-
-         if not keywords:
              story.append(Paragraph("No keywords extracted.", styles['Normal']))
              return story
-
-         # Create keywords table
-         keyword_data = [['Keyword', 'Relevance Score', 'Category']]
-
-         for kw in keywords:
              relevance = kw.get('relevance', 'medium')
-             score = kw.get('score', 0)
-             keyword_data.append([
-                 kw.get('keyword', 'N/A'),
-                 f"{score:.3f}",
-                 relevance.title()
-             ])
-
-         keyword_table = Table(keyword_data)
-         keyword_table.setStyle(TableStyle([
              ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#2E86AB')),
              ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
              ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
              ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
              ('FONTSIZE', (0, 0), (-1, 0), 10),
-             ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
              ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
-             ('GRID', (0, 0), (-1, -1), 1, colors.black)
          ]))
-
-         story.append(keyword_table)
-         story.append(Spacer(1, 0.2*inch))
-
-         # Keywords explanation
-         explanation = "Keywords were extracted using the YAKE (Yet Another Keyword Extractor) algorithm, "
-         explanation += "which identifies the most relevant terms and phrases based on statistical analysis of the text corpus."
-
-         story.append(Paragraph(explanation, styles['Normal']))
-
      except Exception as e:
-         logger.error(f"Keywords section creation failed: {str(e)}")
          story.append(Paragraph("Keyword analysis data unavailable.", styles['Normal']))
-
      return story

- def _create_sources_section(results: Dict[str, Any], styles) -> List:
-     """Create news sources section"""
-     story = []
-
      try:
-         articles = results.get('articles', [])
-
          if not articles:
              story.append(Paragraph("No source data available.", styles['Normal']))
              return story
-
          # Count sources
-         source_counts = {}
-         for article in articles:
-             source = article.get('source', 'Unknown')
-             source_counts[source] = source_counts.get(source, 0) + 1
-
-         # Create sources table
-         source_data = [['News Source', 'Article Count', 'Percentage']]
-         total_articles = len(articles)
-
-         for source, count in sorted(source_counts.items(), key=lambda x: x[1], reverse=True):
-             percentage = (count / total_articles) * 100
-             source_data.append([source, str(count), f"{percentage:.1f}%"])
-
-         sources_table = Table(source_data)
-         sources_table.setStyle(TableStyle([
              ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#2E86AB')),
              ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
              ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
              ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
              ('FONTSIZE', (0, 0), (-1, 0), 10),
-             ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
              ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
-             ('GRID', (0, 0), (-1, -1), 1, colors.black)
          ]))
-
-         story.append(sources_table)
-         story.append(Spacer(1, 0.2*inch))
-
-         # Sources explanation
-         explanation = f"Articles were collected from {len(source_counts)} different news sources, "
-         explanation += "providing diverse perspectives on the analyzed topic. Source diversity helps ensure comprehensive coverage and reduces bias."
-
-         story.append(Paragraph(explanation, styles['Normal']))
-
      except Exception as e:
-         logger.error(f"Sources section creation failed: {str(e)}")
          story.append(Paragraph("Source analysis data unavailable.", styles['Normal']))
-
      return story

- def _create_methodology_section(results: Dict[str, Any]) -> str:
-     """Create methodology section text"""
-     methodology = "This analysis employed a comprehensive natural language processing pipeline:\n\n"
-
-     methodology += "1. <b>Data Collection:</b> News articles were scraped from multiple reliable sources using RSS feeds and web scraping techniques. "
-     methodology += "Content was filtered for relevance and deduplicated to ensure quality.\n\n"
-
-     methodology += "2. <b>Sentiment Analysis:</b> Three complementary models were used: "
-     methodology += "VADER (general sentiment), Loughran-McDonald dictionary (financial sentiment), and FinBERT (financial domain-specific). "
-     methodology += "Final scores represent a weighted combination of all models.\n\n"
-
-     methodology += "3. <b>Text Processing:</b> Articles were cleaned, summarized using transformer models, and analyzed for key themes. "
-     methodology += "Keyword extraction employed the YAKE algorithm for statistical relevance.\n\n"
-
-     methodology += "4. <b>Quality Assurance:</b> All content was filtered for English language, minimum length requirements, and relevance to the query terms. "
-     methodology += "Results were validated across multiple model outputs for consistency.\n\n"
-
-     if results.get('processing_time'):
-         methodology += f"Total processing time: {results['processing_time']:.2f} seconds for {results.get('total_articles', 0)} articles."
-
-     return methodology

- def _generate_simple_pdf_fallback(results: Dict[str, Any]) -> io.BytesIO:
-     """Generate a simple text-based PDF fallback"""
      try:
-         from fpdf import FPDF
-
-         pdf = FPDF()
-         pdf.add_page()
-         pdf.set_font('Arial', 'B', 16)
-         pdf.cell(40, 10, 'News Analysis Report')
-         pdf.ln(20)
-
-         pdf.set_font('Arial', '', 12)
-         pdf.cell(40, 10, f"Query: {results.get('query', 'N/A')}")
-         pdf.ln(10)
-         pdf.cell(40, 10, f"Articles: {results.get('total_articles', 0)}")
-         pdf.ln(10)
-         pdf.cell(40, 10, f"Average Sentiment: {results.get('average_sentiment', 0):.3f}")
-         pdf.ln(20)
-
-         # Simple sentiment distribution
-         dist = results.get('sentiment_distribution', {})
-         pdf.cell(40, 10, 'Sentiment Distribution:')
-         pdf.ln(10)
-         pdf.cell(40, 10, f"Positive: {dist.get('Positive', 0)}")
-         pdf.ln(10)
-         pdf.cell(40, 10, f"Negative: {dist.get('Negative', 0)}")
-         pdf.ln(10)
-         pdf.cell(40, 10, f"Neutral: {dist.get('Neutral', 0)}")
-
-         # Save to buffer
-         buffer = io.BytesIO()
-         pdf_string = pdf.output(dest='S').encode('latin1')
-         buffer.write(pdf_string)
-         buffer.seek(0)
-
-         return buffer
-
-     except Exception as e:
-         logger.error(f"PDF fallback failed: {str(e)}")
-         # Return empty buffer as last resort
-         buffer = io.BytesIO()
-         buffer.write(b"PDF generation failed. Please check logs.")
-         buffer.seek(0)
-         return buffer

  def create_chart_image(data: Dict, chart_type: str = 'pie') -> Optional[str]:
-     """Create a chart image for PDF inclusion"""
      if not MATPLOTLIB_AVAILABLE:
          return None
-
      try:
          plt.figure(figsize=(6, 4))
-
-         if chart_type == 'pie' and 'sentiment_distribution' in data:
-             dist = data['sentiment_distribution']
              labels = ['Positive', 'Negative', 'Neutral']
-             sizes = [dist.get('Positive', 0), dist.get('Negative', 0), dist.get('Neutral', 0)]
-             colors = ['#28a745', '#dc3545', '#6c757d']
-
-             plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
              plt.title('Sentiment Distribution')
-
          elif chart_type == 'bar' and 'articles' in data:
-             articles = data['articles']
-             sources = {}
-             for article in articles:
-                 source = article.get('source', 'Unknown')
-                 sources[source] = sources.get(source, 0) + 1
-
-             # Top 10 sources
-             top_sources = dict(sorted(sources.items(), key=lambda x: x[1], reverse=True)[:10])
-
-             plt.bar(range(len(top_sources)), list(top_sources.values()), color='#2E86AB')
-             plt.xticks(range(len(top_sources)), list(top_sources.keys()), rotation=45, ha='right')
              plt.title('Articles by Source')
-             plt.ylabel('Article Count')
              plt.tight_layout()
-
-         # Save to base64 string
-         buffer = io.BytesIO()
-         plt.savefig(buffer, format='png', dpi=150, bbox_inches='tight')
-         buffer.seek(0)
-
-         image_base64 = base64.b64encode(buffer.getvalue()).decode()
          plt.close()
-
-         return image_base64
-
      except Exception as e:
-         logger.error(f"Chart creation failed: {str(e)}")
          return None

  def generate_csv_report(results: Dict[str, Any]) -> str:
-     """Generate CSV report"""
      try:
          import csv
-         import io
-
-         output = io.StringIO()
-         writer = csv.writer(output)
-
-         # Write header
-         writer.writerow([
-             'Title', 'Source', 'URL', 'Date', 'Sentiment_Score', 'Sentiment_Label',
-             'VADER_Score', 'LM_Score', 'FinBERT_Score', 'Summary'
-         ])
-
-         # Write article data
-         articles = results.get('articles', [])
-         for article in articles:
-             sentiment = article.get('sentiment', {})
-             compound = sentiment.get('compound', 0)
-
-             # Determine sentiment label
              if compound > 0.1:
                  label = 'Positive'
              elif compound < -0.1:
                  label = 'Negative'
              else:
                  label = 'Neutral'
-
-             writer.writerow([
-                 article.get('title', ''),
-                 article.get('source', ''),
-                 article.get('url', ''),
-                 article.get('date', ''),
                  compound,
                  label,
-                 sentiment.get('vader', ''),
-                 sentiment.get('loughran_mcdonald', ''),
-                 sentiment.get('finbert', ''),
-                 article.get('summary', '')[:200] + '...' if len(article.get('summary', '')) > 200 else article.get('summary', '')
              ])
-
-         return output.getvalue()
-
      except Exception as e:
-         logger.error(f"CSV generation failed: {str(e)}")
          return "Error generating CSV report"

  def generate_json_report(results: Dict[str, Any]) -> str:
-     """Generate JSON report with formatted output"""
      try:
          import json
-         from datetime import datetime
-
-         # Create comprehensive report
          report = {
-             'metadata': {
-                 'report_generated': datetime.now().isoformat(),
-                 'query': results.get('query', ''),
-                 'total_articles': results.get('total_articles', 0),
-                 'processing_time_seconds': results.get('processing_time', 0),
-                 'languages': results.get('languages', ['English'])
-             },
-             'summary': {
-                 'average_sentiment': results.get('average_sentiment', 0),
-                 'sentiment_distribution': results.get('sentiment_distribution', {}),
-                 'top_sources': _get_top_sources(results),
-                 'date_range': results.get('summary', {}).get('date_range', {})
-             },
              'articles': results.get('articles', []),
-             'keywords': results.get('keywords', [])[:20],  # Top 20 keywords
              'analysis_methods': {
                  'sentiment_models': ['VADER', 'Loughran-McDonald', 'FinBERT'],
-                 'summarization_model': 'DistilBART',
                  'keyword_extraction': 'YAKE',
                  'translation_models': ['Helsinki-NLP Opus-MT']
              }
          }
-
          return json.dumps(report, indent=2, default=str, ensure_ascii=False)
-
      except Exception as e:
-         logger.error(f"JSON generation failed: {str(e)}")
-         return json.dumps({'error': str(e)}, indent=2)

  def _get_top_sources(results: Dict[str, Any]) -> List[Dict[str, Any]]:
-     """Get top news sources from results"""
      try:
-         articles = results.get('articles', [])
-         sources = {}
-
-         for article in articles:
-             source = article.get('source', 'Unknown')
-             sources[source] = sources.get(source, 0) + 1
-
-         # Convert to list and sort
-         source_list = [
-             {'source': source, 'count': count, 'percentage': round((count / len(articles)) * 100, 1)}
-             for source, count in sources.items()
          ]
-
-         return sorted(source_list, key=lambda x: x['count'], reverse=True)[:10]
-
      except Exception as e:
-         logger.error(f"Top sources calculation failed: {str(e)}")
          return []

  def validate_report_data(results: Dict[str, Any]) -> bool:
-     """Validate that results contain required data for reporting"""
-     required_keys = ['query', 'articles', 'total_articles']
-
-     for key in required_keys:
-         if key not in results:
-             logger.error(f"Missing required key for reporting: {key}")
-             return False
-
      if not isinstance(results['articles'], list) or len(results['articles']) == 0:
          logger.error("No articles available for reporting")
          return False
-
      return True

- # Export functions
  __all__ = [
      'generate_pdf_report',
-     'generate_csv_report',
      'generate_json_report',
      'create_chart_image',
-     'validate_report_data'
- ]
 
  import logging
+ from typing import Dict, List, Any, Optional, Tuple
  import io
  from datetime import datetime
  import base64

+ logger = logging.getLogger(__name__)
+
+ # -------------------------------
+ # Optional PDF backends
+ # -------------------------------
  try:
+     from reportlab.lib.pagesizes import A4
+     from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle
      from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
      from reportlab.lib.units import inch
      from reportlab.lib import colors
      REPORTLAB_AVAILABLE = True
  except ImportError:
      REPORTLAB_AVAILABLE = False

+ try:
+     from fpdf import FPDF
+     FPDF_AVAILABLE = True
+ except ImportError:
+     FPDF_AVAILABLE = False
+
+ # Optional plotting for chart images (base64)
  try:
      import matplotlib.pyplot as plt
      import matplotlib
+     matplotlib.use('Agg')
      MATPLOTLIB_AVAILABLE = True
  except ImportError:
      MATPLOTLIB_AVAILABLE = False


+ # -------------------------------
+ # Small helpers
+ # -------------------------------
+ def _safe_div(a: float, b: float) -> float:
+     try:
+         return (a / b) if b else 0.0
+     except Exception:
+         return 0.0
+
+
+ def _norm_dist_from_results(results: Dict[str, Any]) -> Tuple[int, Dict[str, int], float]:
+     """
+     Normalize fields from both the legacy structure and the new API structure.
+     Returns:
+         total_articles,
+         counts dict {'Positive': int, 'Negative': int, 'Neutral': int},
+         average_sentiment (float)
+     """
+     # Prefer the new API shape: results["summary"]["distribution"] etc.
+     articles = results.get("articles", []) or []
+     total = results.get("total_articles") or len(articles)  # backfill if missing
+
+     avg = 0.0
+     if "summary" in results:
+         avg = results["summary"].get("average_sentiment", 0.0) or 0.0
+         dist = results["summary"].get("distribution", {}) or {}
+         pos = dist.get("positive") or dist.get("Positive") or 0
+         neg = dist.get("negative") or dist.get("Negative") or 0
+         neu = dist.get("neutral") or dist.get("Neutral") or 0
+     else:
+         # Legacy keys (if present)
+         avg = results.get("average_sentiment", 0.0) or 0.0
+         legacy = results.get("sentiment_distribution", {}) or {}
+         pos = legacy.get("Positive") or legacy.get("positive") or 0
+         neg = legacy.get("Negative") or legacy.get("negative") or 0
+         neu = legacy.get("Neutral") or legacy.get("neutral") or 0
+
+     # If counts are 0 but we have articles, compute from article sentiments
+     if (pos + neg + neu == 0) and articles:
+         for a in articles:
+             c = (a.get("sentiment") or {}).get("compound", 0.0)
+             if c > 0.1:
+                 pos += 1
+             elif c < -0.1:
+                 neg += 1
+             else:
+                 neu += 1
+
+     return total, {"Positive": pos, "Negative": neg, "Neutral": neu}, float(avg)
+
+
+ def _get_processing_time(results: Dict[str, Any]) -> float:
+     # New structure: results["summary"]["processing"]["processing_time_seconds"]
+     try:
+         return float(results.get("summary", {}).get("processing", {}).get("processing_time_seconds", 0.0))
+     except Exception:
+         pass
+     # Legacy field
+     try:
+         return float(results.get("processing_time", 0.0))
+     except Exception:
+         return 0.0
+
+
+ # -------------------------------
+ # Public API
+ # -------------------------------
  def generate_pdf_report(results: Dict[str, Any]) -> io.BytesIO:
+     """
+     Generate a comprehensive PDF report.
+     Returns a BytesIO buffer so Streamlit can download directly.
+     """
+     if REPORTLAB_AVAILABLE:
+         try:
+             return _generate_pdf_with_reportlab(results)
+         except Exception as e:
+             logger.exception(f"ReportLab PDF generation failed: {e}")
+
+     # Fallback
+     if FPDF_AVAILABLE:
          return _generate_simple_pdf_fallback(results)
+
+     # Last resort: a tiny text buffer
+     buf = io.BytesIO()
+     buf.write(b"PDF generation is unavailable (ReportLab/FPDF not installed).")
+     buf.seek(0)
+     return buf
+
+
+ # -------------------------------
+ # ReportLab implementation
+ # -------------------------------
+ def _generate_pdf_with_reportlab(results: Dict[str, Any]) -> io.BytesIO:
+     buffer = io.BytesIO()
+
+     doc = SimpleDocTemplate(
+         buffer,
+         pagesize=A4,
+         rightMargin=72,
+         leftMargin=72,
+         topMargin=72,
+         bottomMargin=18,
+     )
+
+     styles = getSampleStyleSheet()
+     title_style = ParagraphStyle(
+         'CustomTitle',
+         parent=styles['Heading1'],
+         fontSize=22,
+         spaceAfter=24,
+         textColor=colors.HexColor('#2E86AB'),
+         alignment=1  # Center
+     )
+     heading_style = ParagraphStyle(
+         'CustomHeading',
+         parent=styles['Heading2'],
+         fontSize=14,
+         spaceAfter=10,
+         spaceBefore=18,
+         textColor=colors.HexColor('#2E86AB')
+     )
+
+     story: List[Any] = []
+
+     # Title
+     query = results.get('query', 'N/A')
+     story.append(Paragraph(f"Global Business News Intelligence Report", title_style))
+     story.append(Spacer(1, 0.35 * inch))
+     story.append(Paragraph(f"Analysis Target: {query}", styles['Normal']))
+     story.append(Paragraph(f"Report Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", styles['Normal']))
+
+     total, dist_counts, avg = _norm_dist_from_results(results)
+     proc_time = _get_processing_time(results)
+     story.append(Paragraph(f"Total Articles Analyzed: {total}", styles['Normal']))
+     story.append(Paragraph(f"Processing Time: {proc_time:.2f} seconds", styles['Normal']))
+     story.append(Spacer(1, 0.25 * inch))
+
+     # Executive Summary
+     story.append(Paragraph("Executive Summary", heading_style))
+     story.append(Paragraph(_create_executive_summary(query, total, avg, dist_counts), styles['Normal']))
+     story.append(Spacer(1, 0.2 * inch))
+
+     # Sentiment Analysis
+     story.append(Paragraph("Sentiment Analysis", heading_style))
+     story.extend(_create_sentiment_section(total, dist_counts, styles))
+
+     # Key Stories
+     story.append(Paragraph("Key Stories", heading_style))
+     story.extend(_create_stories_section(results, styles))
+
+     # Keywords
+     keywords = results.get('keywords') or []
+     if keywords:
+         story.append(Paragraph("Key Topics and Themes", heading_style))
+         story.extend(_create_keywords_section(keywords, styles))
+
+     # Sources
+     story.append(Paragraph("News Sources", heading_style))
+     story.extend(_create_sources_section(results, styles))
+
+     # Methodology
+     story.append(Paragraph("Methodology", heading_style))
+     story.append(Paragraph(_create_methodology_section(results, total, proc_time), styles['Normal']))
+
+     doc.build(story)
+     buffer.seek(0)
+     return buffer
+
+
+ def _create_executive_summary(query: str, total: int, avg_sentiment: float, dist_counts: Dict[str, int]) -> str:
      try:
+         if total == 0:
+             return f"No articles were available to analyze for “{query}”."
+
+         label = "positive" if avg_sentiment > 0.1 else "negative" if avg_sentiment < -0.1 else "neutral"
+
+         pos = dist_counts.get("Positive", 0)
+         neg = dist_counts.get("Negative", 0)
+         neu = dist_counts.get("Neutral", 0)
+
+         pct_pos = _safe_div(pos, total) * 100.0
+         pct_neg = _safe_div(neg, total) * 100.0
+         pct_neu = _safe_div(neu, total) * 100.0
+
+         summary = (
+             f"This report analyzes {total} news articles related to “{query}”. "
+             f"The overall sentiment reveals a {label} tone with an average sentiment score of {avg_sentiment:.3f}. "
+             f"The analysis shows {pos} positive articles ({pct_pos:.1f}%), "
+             f"{neg} negative articles ({pct_neg:.1f}%), and {neu} neutral articles ({pct_neu:.1f}%). "
          )

          if avg_sentiment > 0.2:
+             summary += "Predominantly positive coverage suggests favorable market conditions or public perception."
          elif avg_sentiment < -0.2:
+             summary += "Predominantly negative coverage indicates concerns or challenges that may require attention."
          else:
+             summary += "Balanced coverage suggests a mixed outlook with both opportunities and challenges."
          return summary
      except Exception as e:
+         logger.exception(f"Executive summary creation failed: {e}")
          return "Analysis completed successfully with comprehensive sentiment evaluation across multiple news sources."

+
+ def _create_sentiment_section(total: int, dist_counts: Dict[str, int], styles) -> List[Any]:
+     story: List[Any] = []
      try:
+         pos = dist_counts.get("Positive", 0)
+         neg = dist_counts.get("Negative", 0)
+         neu = dist_counts.get("Neutral", 0)
+
+         data = [
              ['Sentiment', 'Count', 'Percentage'],
+             ['Positive', str(pos), f"{_safe_div(pos, total) * 100:.1f}%"],
+             ['Negative', str(neg), f"{_safe_div(neg, total) * 100:.1f}%"],
+             ['Neutral', str(neu), f"{_safe_div(neu, total) * 100:.1f}%"],
          ]
+
+         table = Table(data)
+         table.setStyle(TableStyle([
              ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#2E86AB')),
              ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
              ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
              ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
              ('FONTSIZE', (0, 0), (-1, 0), 12),
+             ('BOTTOMPADDING', (0, 0), (-1, 0), 10),
              ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
+             ('GRID', (0, 0), (-1, -1), 1, colors.black),
          ]))
+         story.append(table)
+         story.append(Spacer(1, 0.2 * inch))
+
+         explanation = (
+             "Sentiment analysis was performed using multiple models including VADER, "
+             "Loughran–McDonald (financial), and FinBERT. Scores range from -1.0 (most negative) "
+             "to +1.0 (most positive), with -0.1 to +0.1 considered neutral."
+         )
          story.append(Paragraph(explanation, styles['Normal']))
+         story.append(Spacer(1, 0.1 * inch))
      except Exception as e:
+         logger.exception(f"Sentiment section creation failed: {e}")
          story.append(Paragraph("Sentiment analysis data unavailable.", styles['Normal']))
      return story

+
+ def _create_stories_section(results: Dict[str, Any], styles) -> List[Any]:
+     story: List[Any] = []
      try:
+         articles = results.get('articles', []) or []
          if not articles:
              story.append(Paragraph("No articles available for analysis.", styles['Normal']))
              return story
+
+         # Sort by compound sentiment
+         sorted_by_pos = sorted(articles, key=lambda x: (x.get('sentiment') or {}).get('compound', 0.0), reverse=True)
+         sorted_by_neg = sorted(articles, key=lambda x: (x.get('sentiment') or {}).get('compound', 0.0))
+
+         # Most positive
+         if sorted_by_pos and (sorted_by_pos[0].get('sentiment') or {}).get('compound', 0.0) > 0.1:
+             a = sorted_by_pos[0]
              story.append(Paragraph("Most Positive Coverage:", styles['Heading3']))
+             story.append(Paragraph(f"<b>Title:</b> {a.get('title','N/A')}", styles['Normal']))
+             story.append(Paragraph(f"<b>Source:</b> {a.get('source','N/A')}", styles['Normal']))
+             story.append(Paragraph(f"<b>Sentiment Score:</b> {(a.get('sentiment') or {}).get('compound', 0.0):.3f}", styles['Normal']))
+             if a.get('summary'):
+                 story.append(Paragraph(f"<b>Summary:</b> {a['summary'][:300]}{'...' if len(a['summary'])>300 else ''}", styles['Normal']))
+             story.append(Spacer(1, 0.15 * inch))
+
+         # Most negative
+         if sorted_by_neg and (sorted_by_neg[0].get('sentiment') or {}).get('compound', 0.0) < -0.1:
+             a = sorted_by_neg[0]
              story.append(Paragraph("Most Negative Coverage:", styles['Heading3']))
+             story.append(Paragraph(f"<b>Title:</b> {a.get('title','N/A')}", styles['Normal']))
+             story.append(Paragraph(f"<b>Source:</b> {a.get('source','N/A')}", styles['Normal']))
+             story.append(Paragraph(f"<b>Sentiment Score:</b> {(a.get('sentiment') or {}).get('compound', 0.0):.3f}", styles['Normal']))
+             if a.get('summary'):
+                 story.append(Paragraph(f"<b>Summary:</b> {a['summary'][:300]}{'...' if len(a['summary'])>300 else ''}", styles['Normal']))
+
+         # Latest coverage (if dates are present)
+         recent = [a for a in articles if a.get('date')]
+         if recent:
+             try:
+                 recent.sort(key=lambda x: x.get('date'), reverse=True)
+                 r = recent[0]
+                 story.append(Spacer(1, 0.15 * inch))
+                 story.append(Paragraph("Most Recent Coverage:", styles['Heading3']))
+                 story.append(Paragraph(f"<b>Title:</b> {r.get('title','N/A')}", styles['Normal']))
+                 story.append(Paragraph(f"<b>Source:</b> {r.get('source','N/A')}", styles['Normal']))
+                 story.append(Paragraph(f"<b>Date:</b> {r.get('date')}", styles['Normal']))
+                 story.append(Paragraph(f"<b>Sentiment Score:</b> {(r.get('sentiment') or {}).get('compound', 0.0):.3f}", styles['Normal']))
+             except Exception:
+                 pass
+
      except Exception as e:
+         logger.exception(f"Stories section creation failed: {e}")
          story.append(Paragraph("Story analysis data unavailable.", styles['Normal']))
      return story

+
+ def _create_keywords_section(keywords: List[Dict[str, Any]], styles) -> List[Any]:
+     story: List[Any] = []
      try:
+         top = keywords[:15]
+         if not top:
              story.append(Paragraph("No keywords extracted.", styles['Normal']))
              return story
+
+         data = [['Keyword', 'Score', 'Category']]
+         for kw in top:
+             score = kw.get('score', 0.0)
              relevance = kw.get('relevance', 'medium')
+             data.append([kw.get('keyword', 'N/A'), f"{score:.3f}", str(relevance).title()])
+
+         table = Table(data)
+         table.setStyle(TableStyle([
              ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#2E86AB')),
              ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
              ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
              ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
              ('FONTSIZE', (0, 0), (-1, 0), 10),
+             ('BOTTOMPADDING', (0, 0), (-1, 0), 10),
              ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
+             ('GRID', (0, 0), (-1, -1), 1, colors.black),
          ]))
+         story.append(table)
+         story.append(Spacer(1, 0.15 * inch))
+
+         expl = ("Keywords were extracted using the YAKE algorithm, which identifies relevant terms and phrases "
+                 "based on statistical features of the text corpus.")
+         story.append(Paragraph(expl, styles['Normal']))
      except Exception as e:
+         logger.exception(f"Keywords section creation failed: {e}")
          story.append(Paragraph("Keyword analysis data unavailable.", styles['Normal']))
      return story

+
+ def _create_sources_section(results: Dict[str, Any], styles) -> List[Any]:
+     story: List[Any] = []
      try:
+         articles = results.get('articles', []) or []
          if not articles:
              story.append(Paragraph("No source data available.", styles['Normal']))
              return story
+
          # Count sources
+         counts: Dict[str, int] = {}
+         for a in articles:
+             src = a.get('source', 'Unknown')
+             counts[src] = counts.get(src, 0) + 1
+
+         total = len(articles)
+         data = [['News Source', 'Article Count', 'Percentage']]
+         for src, ct in sorted(counts.items(), key=lambda x: x[1], reverse=True):
+             data.append([src, str(ct), f"{_safe_div(ct, total) * 100:.1f}%"])
+
+         table = Table(data)
+         table.setStyle(TableStyle([
              ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#2E86AB')),
              ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
              ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
              ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
              ('FONTSIZE', (0, 0), (-1, 0), 10),
+             ('BOTTOMPADDING', (0, 0), (-1, 0), 10),
              ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
+             ('GRID', (0, 0), (-1, -1), 1, colors.black),
          ]))
+         story.append(table)
+         story.append(Spacer(1, 0.15 * inch))
+
+         expl = (f"Articles were collected from {len(counts)} different sources, providing diverse perspectives. "
+                 "Source diversity helps ensure comprehensive coverage and reduces bias.")
+         story.append(Paragraph(expl, styles['Normal']))
      except Exception as e:
+         logger.exception(f"Sources section creation failed: {e}")
          story.append(Paragraph("Source analysis data unavailable.", styles['Normal']))
      return story


+ def _create_methodology_section(results: Dict[str, Any], total: int, proc_time: float) -> str:
+     meth = (
+         "This analysis employed a comprehensive NLP pipeline:\n\n"
+         "1. <b>Data Collection:</b> Articles were gathered from multiple RSS/business feeds. "
+         "Content was filtered for relevance and deduplicated.\n\n"
+         "2. <b>Sentiment Analysis:</b> VADER (general), Loughran–McDonald (finance), and FinBERT (finance) were combined. "
+         "Final scores reflect a weighted composite.\n\n"
+         "3. <b>Summarization & Keywords:</b> Articles were cleaned and summarized (transformer models when available), "
+         "and key themes extracted with YAKE.\n\n"
+         "4. <b>Quality Controls:</b> English-only filtering, minimum length checks, and relevance filters.\n\n"
+     )
      try:
+         meth += f"Processed {total} articles in {proc_time:.2f} seconds."
+     except Exception:
+         pass
+     return meth
+
+
+ # -------------------------------
+ # FPDF fallback
+ # -------------------------------
+ def _generate_simple_pdf_fallback(results: Dict[str, Any]) -> io.BytesIO:
+     total, dist_counts, avg = _norm_dist_from_results(results)
+     query = results.get('query', 'N/A')
+
+     pdf = FPDF()
+     pdf.add_page()
+     pdf.set_font('Arial', 'B', 16)
+     pdf.cell(0, 10, 'News Analysis Report', ln=True)
+     pdf.ln(5)
+
+     pdf.set_font('Arial', '', 12)
+     pdf.cell(0, 8, f"Query: {query}", ln=True)
+     pdf.cell(0, 8, f"Articles: {total}", ln=True)
+     pdf.cell(0, 8, f"Average Sentiment: {avg:.3f}", ln=True)
+     pdf.ln(5)
+
+     pos, neg, neu = dist_counts.get("Positive", 0), dist_counts.get("Negative", 0), dist_counts.get("Neutral", 0)
+     pdf.cell(0, 8, "Sentiment Distribution:", ln=True)
+     pdf.cell(0, 8, f"  Positive: {pos} ({_safe_div(pos, total)*100:.1f}%)", ln=True)
+     pdf.cell(0, 8, f"  Negative: {neg} ({_safe_div(neg, total)*100:.1f}%)", ln=True)
+     pdf.cell(0, 8, f"  Neutral: {neu} ({_safe_div(neu, total)*100:.1f}%)", ln=True)

+     buf = io.BytesIO()
+     pdf_bytes = pdf.output(dest='S').encode('latin1')
+     buf.write(pdf_bytes)
+     buf.seek(0)
+     return buf
+
+
+ # -------------------------------
+ # Optional chart image (base64)
+ # -------------------------------
  def create_chart_image(data: Dict, chart_type: str = 'pie') -> Optional[str]:
      if not MATPLOTLIB_AVAILABLE:
          return None
      try:
          plt.figure(figsize=(6, 4))
+         if chart_type == 'pie':
+             # Support both shapes
+             total, dist_counts, _ = _norm_dist_from_results(data if 'articles' in data else {'summary': {'distribution': data}})
              labels = ['Positive', 'Negative', 'Neutral']
+             sizes = [
+                 dist_counts.get('Positive', 0),
+                 dist_counts.get('Negative', 0),
+                 dist_counts.get('Neutral', 0),
+             ]
+             plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)
              plt.title('Sentiment Distribution')
          elif chart_type == 'bar' and 'articles' in data:
+             sources: Dict[str, int] = {}
+             for a in data.get('articles', []):
+                 s = a.get('source', 'Unknown')
+                 sources[s] = sources.get(s, 0) + 1
+             top = dict(sorted(sources.items(), key=lambda x: x[1], reverse=True)[:10])
+             plt.bar(range(len(top)), list(top.values()))
+             plt.xticks(range(len(top)), list(top.keys()), rotation=45, ha='right')
              plt.title('Articles by Source')
+             plt.ylabel('Count')
              plt.tight_layout()
+
+         buf = io.BytesIO()
+         plt.savefig(buf, format='png', dpi=150, bbox_inches='tight')
+         buf.seek(0)
+         img64 = base64.b64encode(buf.getvalue()).decode()
          plt.close()
+         return img64
      except Exception as e:
+         logger.exception(f"Chart creation failed: {e}")
          return None

+
+ # -------------------------------
+ # CSV / JSON helpers (unchanged public API)
+ # -------------------------------
  def generate_csv_report(results: Dict[str, Any]) -> str:
      try:
          import csv
+         import io as _io
+         out = _io.StringIO()
+         w = csv.writer(out)
+         w.writerow(['Title', 'Source', 'URL', 'Date', 'Sentiment_Score', 'Sentiment_Label',
+                     'VADER_Score', 'LM_Score', 'FinBERT_Score', 'Summary'])
+         for a in results.get('articles', []):
+             s = a.get('sentiment', {}) or {}
+             compound = s.get('compound', 0.0)
              if compound > 0.1:
                  label = 'Positive'
              elif compound < -0.1:
                  label = 'Negative'
              else:
                  label = 'Neutral'
+             w.writerow([
+                 a.get('title', ''),
+                 a.get('source', ''),
+                 a.get('url', ''),
+                 a.get('date', ''),
                  compound,
                  label,
+                 s.get('vader', ''),
+                 s.get('loughran_mcdonald', ''),
+                 s.get('finbert', ''),
+                 (a.get('summary', '')[:200] + '...') if len(a.get('summary', '') or '') > 200 else a.get('summary', '')
              ])
+         return out.getvalue()
      except Exception as e:
+         logger.exception(f"CSV generation failed: {e}")
          return "Error generating CSV report"

+
  def generate_json_report(results: Dict[str, Any]) -> str:
      try:
          import json
+         meta = {
+             'report_generated': datetime.now().isoformat(),
+             'query': results.get('query', ''),
+             'languages': results.get('languages', ['English']),
+         }
+         total, dist_counts, avg = _norm_dist_from_results(results)
+         summary = {
+             'total_articles': total,
+             'average_sentiment': avg,
+             'sentiment_distribution': dist_counts,
+             'top_sources': _get_top_sources(results),
+         }
          report = {
+             'metadata': meta,
+             'summary': summary,
              'articles': results.get('articles', []),
+             'keywords': (results.get('keywords', []) or [])[:20],
              'analysis_methods': {
                  'sentiment_models': ['VADER', 'Loughran-McDonald', 'FinBERT'],
+                 'summarization_model': 'BART/DistilBART/T5 (when available)',
                  'keyword_extraction': 'YAKE',
                  'translation_models': ['Helsinki-NLP Opus-MT']
              }
          }
          return json.dumps(report, indent=2, default=str, ensure_ascii=False)
      except Exception as e:
+         logger.exception(f"JSON generation failed: {e}")
+         try:
+             import json
+             return json.dumps({'error': str(e)}, indent=2)
+         except Exception:
+             return '{"error":"JSON generation failed"}'
+

  def _get_top_sources(results: Dict[str, Any]) -> List[Dict[str, Any]]:
      try:
+         arts = results.get('articles', []) or []
+         total = len(arts)
+         counts: Dict[str, int] = {}
+         for a in arts:
+             src = a.get('source', 'Unknown')
+             counts[src] = counts.get(src, 0) + 1
+         items = [
+             {'source': s, 'count': c, 'percentage': round(_safe_div(c, total) * 100.0, 1)}
+             for s, c in counts.items()
          ]
+         return sorted(items, key=lambda x: x['count'], reverse=True)[:10]
      except Exception as e:
+         logger.exception(f"Top sources calculation failed: {e}")
          return []

+
  def validate_report_data(results: Dict[str, Any]) -> bool:
+     """
+     Validate that results contain required data for reporting.
+     We’re lenient now: require 'articles' and 'query'.
+     """
+     if 'query' not in results or 'articles' not in results:
+         logger.error("Missing required keys: 'query' and/or 'articles'")
+         return False
      if not isinstance(results['articles'], list) or len(results['articles']) == 0:
          logger.error("No articles available for reporting")
          return False
      return True

+
  __all__ = [
      'generate_pdf_report',
+     'generate_csv_report',
      'generate_json_report',
      'create_chart_image',
+     'validate_report_data',
+ ]
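
For reference, below is a minimal sketch (not part of the commit) of the results payload the updated module expects. The field names come from the diff itself; the query, sources, URLs, and score values are invented purely for illustration. Note that _norm_dist_from_results also accepts the legacy top-level keys (average_sentiment, sentiment_distribution), so either shape should validate.

# Illustrative payload; sample values are made up, field names taken from report.py.
from report import validate_report_data, generate_json_report

results = {
    "query": "semiconductor supply chain",      # hypothetical query
    "total_articles": 2,
    "articles": [
        {
            "title": "Chipmaker expands capacity",
            "source": "Example Wire",            # hypothetical source
            "url": "https://example.com/a1",
            "date": "2024-05-01",
            "summary": "Capacity expansion announced for next year.",
            "sentiment": {"compound": 0.42, "vader": 0.50,
                          "loughran_mcdonald": 0.30, "finbert": 0.45},
        },
        {
            "title": "Export curbs weigh on suppliers",
            "source": "Example Wire",
            "url": "https://example.com/a2",
            "date": "2024-05-02",
            "summary": "New restrictions pressure smaller suppliers.",
            "sentiment": {"compound": -0.31, "vader": -0.20,
                          "loughran_mcdonald": -0.40, "finbert": -0.33},
        },
    ],
    # New-style summary block read by _norm_dist_from_results; legacy
    # top-level keys (average_sentiment, sentiment_distribution) also work.
    "summary": {
        "average_sentiment": 0.055,
        "distribution": {"positive": 1, "negative": 1, "neutral": 0},
        "processing": {"processing_time_seconds": 3.2},
    },
    "keywords": [{"keyword": "supply chain", "score": 0.012, "relevance": "high"}],
}

if validate_report_data(results):
    print(generate_json_report(results)[:200])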
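
And a downstream usage sketch, reusing the results dict from the previous example and assuming report.py is on the import path. Per the new docstrings, generate_pdf_report always returns an io.BytesIO (degrading from ReportLab to FPDF to a plain-text byte buffer), and create_chart_image returns a base64 string or None when matplotlib is not installed; the output file names here are arbitrary.

# Usage sketch only; file names are arbitrary, behavior as documented in the diff.
import base64
from report import generate_pdf_report, generate_csv_report, create_chart_image

pdf_buf = generate_pdf_report(results)   # io.BytesIO, e.g. for a Streamlit download button
with open("news_report.pdf", "wb") as fh:
    fh.write(pdf_buf.getvalue())

with open("news_report.csv", "w", encoding="utf-8") as fh:
    fh.write(generate_csv_report(results))

img64 = create_chart_image(results, chart_type="pie")
if img64 is not None:                    # None means matplotlib is unavailable
    with open("sentiment_pie.png", "wb") as fh:
        fh.write(base64.b64decode(img64))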