"""Report generation utilities: PDF, CSV, and JSON exports for news analysis results."""

import base64
import io
import logging
from datetime import datetime
from typing import Any, Dict, List, Optional

# PDF generation
try:
    from reportlab.lib.pagesizes import letter, A4
    from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, Image
    from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
    from reportlab.lib.units import inch
    from reportlab.lib import colors
    from reportlab.graphics.shapes import Drawing
    from reportlab.graphics.charts.piecharts import Pie
    from reportlab.graphics.charts.barcharts import VerticalBarChart

    REPORTLAB_AVAILABLE = True
except ImportError:
    REPORTLAB_AVAILABLE = False

# Plotting for charts in PDF. Select the non-interactive backend *before*
# importing pyplot so the setting reliably takes effect.
try:
    import matplotlib
    matplotlib.use('Agg')  # Non-interactive backend; no display required
    import matplotlib.pyplot as plt

    MATPLOTLIB_AVAILABLE = True
except ImportError:
    MATPLOTLIB_AVAILABLE = False

logger = logging.getLogger(__name__)


def generate_pdf_report(results: Dict[str, Any]) -> io.BytesIO:
    """Generate a comprehensive PDF report."""
    if not REPORTLAB_AVAILABLE:
        logger.error("ReportLab not available for PDF generation")
        return _generate_simple_pdf_fallback(results)

    try:
        # Create PDF buffer
        buffer = io.BytesIO()

        # Create document
        doc = SimpleDocTemplate(
            buffer,
            pagesize=A4,
            rightMargin=72,
            leftMargin=72,
            topMargin=72,
            bottomMargin=18
        )

        # Get base styles and derive custom ones
        styles = getSampleStyleSheet()
        title_style = ParagraphStyle(
            'CustomTitle',
            parent=styles['Heading1'],
            fontSize=24,
            spaceAfter=30,
            textColor=colors.HexColor('#2E86AB'),
            alignment=1  # 1 = TA_CENTER
        )
        heading_style = ParagraphStyle(
            'CustomHeading',
            parent=styles['Heading2'],
            fontSize=16,
            spaceAfter=12,
            spaceBefore=20,
            textColor=colors.HexColor('#2E86AB')
        )

        # Build story (the ordered list of flowables)
        story = []

        # Title page
        story.append(Paragraph("Global Business News Intelligence Report", title_style))
        story.append(Spacer(1, 0.5 * inch))

        # Query and basic info
        story.append(Paragraph(f"Analysis Target: {results.get('query', 'N/A')}", styles['Normal']))
        story.append(Paragraph(f"Report Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", styles['Normal']))
        story.append(Paragraph(f"Total Articles Analyzed: {results.get('total_articles', 0)}", styles['Normal']))
        story.append(Paragraph(f"Processing Time: {results.get('processing_time', 0):.2f} seconds", styles['Normal']))
        story.append(Spacer(1, 0.3 * inch))

        # Executive Summary
        story.append(Paragraph("Executive Summary", heading_style))
        summary_text = _create_executive_summary(results)
        story.append(Paragraph(summary_text, styles['Normal']))
        story.append(Spacer(1, 0.2 * inch))

        # Sentiment Analysis Section
        story.append(Paragraph("Sentiment Analysis", heading_style))
        sentiment_data = _create_sentiment_section(results, styles)
        story.extend(sentiment_data)

        # Top Stories Section
        story.append(Paragraph("Key Stories", heading_style))
        stories_data = _create_stories_section(results, styles)
        story.extend(stories_data)

        # Keywords Section
        if 'keywords' in results and results['keywords']:
            story.append(Paragraph("Key Topics and Themes", heading_style))
            keywords_data = _create_keywords_section(results, styles)
            story.extend(keywords_data)

        # Sources Section
        story.append(Paragraph("News Sources", heading_style))
        sources_data = _create_sources_section(results, styles)
        story.extend(sources_data)

        # Methodology Section
        story.append(Paragraph("Methodology", heading_style))
        methodology_text = _create_methodology_section(results)
        story.append(Paragraph(methodology_text, styles['Normal']))

        # Build the PDF and rewind the buffer for reading
        doc.build(story)
        buffer.seek(0)
        return buffer
    except Exception as e:
        logger.error(f"PDF generation failed: {str(e)}")
        return _generate_simple_pdf_fallback(results)


def _create_executive_summary(results: Dict[str, Any]) -> str:
    """Create the executive summary text."""
    try:
        query = results.get('query', 'the analyzed topic')
        total_articles = results.get('total_articles', 0)
        avg_sentiment = results.get('average_sentiment', 0)
        sentiment_label = "positive" if avg_sentiment > 0.1 else "negative" if avg_sentiment < -0.1 else "neutral"

        summary = f"This report analyzes {total_articles} news articles related to {query}. "
        summary += f"The overall sentiment analysis reveals a {sentiment_label} tone with an average sentiment score of {avg_sentiment:.3f}. "

        # Add sentiment distribution (guard against division by zero)
        dist = results.get('sentiment_distribution', {})
        positive = dist.get('Positive', 0)
        negative = dist.get('Negative', 0)
        neutral = dist.get('Neutral', 0)
        denominator = max(total_articles, 1)
        summary += f"The analysis shows {positive} positive articles ({positive / denominator * 100:.1f}%), "
        summary += f"{negative} negative articles ({negative / denominator * 100:.1f}%), "
        summary += f"and {neutral} neutral articles ({neutral / denominator * 100:.1f}%). "

        # Add key insights
        if avg_sentiment > 0.2:
            summary += "The predominantly positive coverage suggests favorable market conditions or public perception."
        elif avg_sentiment < -0.2:
            summary += "The predominantly negative coverage indicates concerns or challenges that may require attention."
        else:
            summary += "The balanced sentiment coverage suggests a mixed outlook with both opportunities and challenges present."
        return summary
    except Exception as e:
        logger.error(f"Executive summary creation failed: {str(e)}")
        return "Analysis completed successfully with comprehensive sentiment evaluation across multiple news sources."


def _create_sentiment_section(results: Dict[str, Any], styles) -> List:
    """Create the sentiment analysis section."""
    story = []
    try:
        # Sentiment distribution table. Note: the original divisor,
        # results.get('total_articles', 1), still yields zero when the key is
        # present with value 0, so clamp it instead.
        dist = results.get('sentiment_distribution', {})
        total = max(results.get('total_articles', 0), 1)
        sentiment_data = [['Sentiment', 'Count', 'Percentage']]
        for label in ('Positive', 'Negative', 'Neutral'):
            count = dist.get(label, 0)
            sentiment_data.append([label, str(count), f"{count / total * 100:.1f}%"])

        sentiment_table = Table(sentiment_data)
        sentiment_table.setStyle(TableStyle([
            ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#2E86AB')),
            ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
            ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
            ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
            ('FONTSIZE', (0, 0), (-1, 0), 12),
            ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
            ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
            ('GRID', (0, 0), (-1, -1), 1, colors.black)
        ]))
        story.append(sentiment_table)
        story.append(Spacer(1, 0.2 * inch))

        # Add sentiment analysis explanation
        explanation = (
            "Sentiment analysis was performed using multiple models including VADER, "
            "the Loughran-McDonald financial dictionary, and FinBERT. Scores range from "
            "-1.0 (most negative) to +1.0 (most positive); scores between -0.1 and +0.1 "
            "are considered neutral."
        )
        story.append(Paragraph(explanation, styles['Normal']))
        story.append(Spacer(1, 0.2 * inch))
    except Exception as e:
        logger.error(f"Sentiment section creation failed: {str(e)}")
        story.append(Paragraph("Sentiment analysis data unavailable.", styles['Normal']))
    return story


def _create_stories_section(results: Dict[str, Any], styles) -> List:
    """Create the top stories section."""
    story = []
    try:
        articles = results.get('articles', [])
        if not articles:
            story.append(Paragraph("No articles available for analysis.", styles['Normal']))
            return story

        def _truncate(text: str, limit: int = 300) -> str:
            # Only append an ellipsis when the text was actually cut
            return text[:limit] + '...' if len(text) > limit else text

        # Sort articles by compound sentiment score, most positive first
        sorted_articles = sorted(articles, key=lambda x: x.get('sentiment', {}).get('compound', 0), reverse=True)

        # Most positive story
        if sorted_articles and sorted_articles[0].get('sentiment', {}).get('compound', 0) > 0.1:
            story.append(Paragraph("Most Positive Coverage:", styles['Heading3']))
            top_positive = sorted_articles[0]
            story.append(Paragraph(f"<b>Title:</b> {top_positive.get('title', 'N/A')}", styles['Normal']))
            story.append(Paragraph(f"<b>Source:</b> {top_positive.get('source', 'N/A')}", styles['Normal']))
            story.append(Paragraph(f"<b>Sentiment Score:</b> {top_positive.get('sentiment', {}).get('compound', 0):.3f}", styles['Normal']))
            if 'summary' in top_positive:
                story.append(Paragraph(f"<b>Summary:</b> {_truncate(top_positive['summary'])}", styles['Normal']))
            story.append(Spacer(1, 0.2 * inch))

        # Most negative story
        negative_articles = sorted(articles, key=lambda x: x.get('sentiment', {}).get('compound', 0))
        if negative_articles and negative_articles[0].get('sentiment', {}).get('compound', 0) < -0.1:
            story.append(Paragraph("Most Negative Coverage:", styles['Heading3']))
            top_negative = negative_articles[0]
            story.append(Paragraph(f"<b>Title:</b> {top_negative.get('title', 'N/A')}", styles['Normal']))
            story.append(Paragraph(f"<b>Source:</b> {top_negative.get('source', 'N/A')}", styles['Normal']))
            story.append(Paragraph(f"<b>Sentiment Score:</b> {top_negative.get('sentiment', {}).get('compound', 0):.3f}", styles['Normal']))
            if 'summary' in top_negative:
                story.append(Paragraph(f"<b>Summary:</b> {_truncate(top_negative['summary'])}", styles['Normal']))
            story.append(Spacer(1, 0.2 * inch))

        # Most recent story (if dates are available)
        recent_articles = [a for a in articles if a.get('date')]
        if recent_articles:
            recent_articles.sort(key=lambda x: x.get('date', ''), reverse=True)
            story.append(Paragraph("Most Recent Coverage:", styles['Heading3']))
            recent = recent_articles[0]
            story.append(Paragraph(f"<b>Title:</b> {recent.get('title', 'N/A')}", styles['Normal']))
            story.append(Paragraph(f"<b>Source:</b> {recent.get('source', 'N/A')}", styles['Normal']))
            story.append(Paragraph(f"<b>Date:</b> {recent.get('date', 'N/A')}", styles['Normal']))
            story.append(Paragraph(f"<b>Sentiment Score:</b> {recent.get('sentiment', {}).get('compound', 0):.3f}", styles['Normal']))
    except Exception as e:
        logger.error(f"Stories section creation failed: {str(e)}")
        story.append(Paragraph("Story analysis data unavailable.", styles['Normal']))
    return story


def _create_keywords_section(results: Dict[str, Any], styles) -> List:
    """Create the keywords section."""
    story = []
    try:
        keywords = results.get('keywords', [])[:15]  # Top 15 keywords
        if not keywords:
            story.append(Paragraph("No keywords extracted.", styles['Normal']))
            return story

        # Create keywords table
        keyword_data = [['Keyword', 'Relevance Score', 'Category']]
        for kw in keywords:
            relevance = kw.get('relevance', 'medium')
            score = kw.get('score', 0)
            keyword_data.append([
                kw.get('keyword', 'N/A'),
                f"{score:.3f}",
                relevance.title()
            ])

        keyword_table = Table(keyword_data)
        keyword_table.setStyle(TableStyle([
            ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#2E86AB')),
            ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
            ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
            ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
            ('FONTSIZE', (0, 0), (-1, 0), 10),
            ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
            ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
            ('GRID', (0, 0), (-1, -1), 1, colors.black)
        ]))
        story.append(keyword_table)
        story.append(Spacer(1, 0.2 * inch))

        # Keywords explanation
        explanation = (
            "Keywords were extracted using the YAKE (Yet Another Keyword Extractor) algorithm, "
            "which identifies the most relevant terms and phrases based on statistical analysis "
            "of the text corpus."
        )
        story.append(Paragraph(explanation, styles['Normal']))
    except Exception as e:
        logger.error(f"Keywords section creation failed: {str(e)}")
        story.append(Paragraph("Keyword analysis data unavailable.", styles['Normal']))
    return story


def _create_sources_section(results: Dict[str, Any], styles) -> List:
    """Create the news sources section."""
    story = []
    try:
        articles = results.get('articles', [])
        if not articles:
            story.append(Paragraph("No source data available.", styles['Normal']))
            return story

        # Count articles per source
        source_counts = {}
        for article in articles:
            source = article.get('source', 'Unknown')
            source_counts[source] = source_counts.get(source, 0) + 1

        # Create sources table, most-covered source first
        source_data = [['News Source', 'Article Count', 'Percentage']]
        total_articles = len(articles)
        for source, count in sorted(source_counts.items(), key=lambda x: x[1], reverse=True):
            percentage = (count / total_articles) * 100
            source_data.append([source, str(count), f"{percentage:.1f}%"])

        sources_table = Table(source_data)
        sources_table.setStyle(TableStyle([
            ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#2E86AB')),
            ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
            ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
            ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
            ('FONTSIZE', (0, 0), (-1, 0), 10),
            ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
            ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
            ('GRID', (0, 0), (-1, -1), 1, colors.black)
        ]))
        story.append(sources_table)
        story.append(Spacer(1, 0.2 * inch))

        # Sources explanation
        explanation = (
            f"Articles were collected from {len(source_counts)} different news sources, "
            "providing diverse perspectives on the analyzed topic. Source diversity helps "
            "ensure comprehensive coverage and reduces bias."
        )
        story.append(Paragraph(explanation, styles['Normal']))
    except Exception as e:
        logger.error(f"Sources section creation failed: {str(e)}")
        story.append(Paragraph("Source analysis data unavailable.", styles['Normal']))
    return story


def _create_methodology_section(results: Dict[str, Any]) -> str:
    """Create the methodology section text."""
    methodology = "This analysis employed a comprehensive natural language processing pipeline:\n\n"
    methodology += (
        "1. <b>Data Collection:</b> News articles were scraped from multiple reliable sources "
        "using RSS feeds and web scraping techniques. Content was filtered for relevance and "
        "deduplicated to ensure quality.\n\n"
    )
    methodology += (
        "2. <b>Sentiment Analysis:</b> Three complementary models were used: VADER (general "
        "sentiment), the Loughran-McDonald dictionary (financial sentiment), and FinBERT "
        "(financial domain-specific). Final scores represent a weighted combination of all models.\n\n"
    )
    methodology += (
        "3. <b>Text Processing:</b> Articles were cleaned, summarized using transformer models, "
        "and analyzed for key themes. Keyword extraction employed the YAKE algorithm for "
        "statistical relevance.\n\n"
    )
    methodology += (
        "4. <b>Quality Assurance:</b> All content was filtered for English language, minimum "
        "length requirements, and relevance to the query terms. Results were validated across "
        "multiple model outputs for consistency.\n\n"
    )
    if results.get('processing_time'):
        methodology += (
            f"Total processing time: {results['processing_time']:.2f} seconds "
            f"for {results.get('total_articles', 0)} articles."
        )
    return methodology


def _generate_simple_pdf_fallback(results: Dict[str, Any]) -> io.BytesIO:
    """Generate a simple text-based PDF fallback using FPDF."""
    try:
        from fpdf import FPDF

        pdf = FPDF()
        pdf.add_page()
        pdf.set_font('Arial', 'B', 16)
        pdf.cell(40, 10, 'News Analysis Report')
        pdf.ln(20)

        pdf.set_font('Arial', '', 12)
        pdf.cell(40, 10, f"Query: {results.get('query', 'N/A')}")
        pdf.ln(10)
        pdf.cell(40, 10, f"Articles: {results.get('total_articles', 0)}")
        pdf.ln(10)
        pdf.cell(40, 10, f"Average Sentiment: {results.get('average_sentiment', 0):.3f}")
        pdf.ln(20)

        # Simple sentiment distribution
        dist = results.get('sentiment_distribution', {})
        pdf.cell(40, 10, 'Sentiment Distribution:')
        pdf.ln(10)
        pdf.cell(40, 10, f"Positive: {dist.get('Positive', 0)}")
        pdf.ln(10)
        pdf.cell(40, 10, f"Negative: {dist.get('Negative', 0)}")
        pdf.ln(10)
        pdf.cell(40, 10, f"Neutral: {dist.get('Neutral', 0)}")

        # Save to buffer. PyFPDF 1.x returns a latin-1 str from output(dest='S'),
        # while fpdf2 returns a bytearray; handle both.
        raw = pdf.output(dest='S')
        pdf_bytes = raw.encode('latin-1') if isinstance(raw, str) else bytes(raw)
        return io.BytesIO(pdf_bytes)
    except Exception as e:
        logger.error(f"PDF fallback failed: {str(e)}")
        # Return a plain-text buffer as a last resort
        buffer = io.BytesIO()
        buffer.write(b"PDF generation failed. Please check logs.")
        buffer.seek(0)
        return buffer


def create_chart_image(data: Dict, chart_type: str = 'pie') -> Optional[str]:
    """Create a chart image (base64-encoded PNG) for PDF inclusion."""
    if not MATPLOTLIB_AVAILABLE:
        return None
    try:
        plt.figure(figsize=(6, 4))
        if chart_type == 'pie' and 'sentiment_distribution' in data:
            dist = data['sentiment_distribution']
            labels = ['Positive', 'Negative', 'Neutral']
            sizes = [dist.get('Positive', 0), dist.get('Negative', 0), dist.get('Neutral', 0)]
            # Named pie_colors to avoid shadowing the reportlab `colors` module
            pie_colors = ['#28a745', '#dc3545', '#6c757d']
            plt.pie(sizes, labels=labels, colors=pie_colors, autopct='%1.1f%%', startangle=90)
            plt.title('Sentiment Distribution')
        elif chart_type == 'bar' and 'articles' in data:
            articles = data['articles']
            sources = {}
            for article in articles:
                source = article.get('source', 'Unknown')
                sources[source] = sources.get(source, 0) + 1

            # Top 10 sources by article count
            top_sources = dict(sorted(sources.items(), key=lambda x: x[1], reverse=True)[:10])
            plt.bar(range(len(top_sources)), list(top_sources.values()), color='#2E86AB')
            plt.xticks(range(len(top_sources)), list(top_sources.keys()), rotation=45, ha='right')
            plt.title('Articles by Source')
            plt.ylabel('Article Count')

        plt.tight_layout()

        # Encode the rendered figure as a base64 PNG string
        buffer = io.BytesIO()
        plt.savefig(buffer, format='png', dpi=150, bbox_inches='tight')
        buffer.seek(0)
        image_base64 = base64.b64encode(buffer.getvalue()).decode()
        plt.close()
        return image_base64
    except Exception as e:
        logger.error(f"Chart creation failed: {str(e)}")
        return None
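

# The helper below is a sketch, not part of the original pipeline: it shows one
# way to turn the base64 PNG returned by create_chart_image() into a ReportLab
# Image flowable that could be appended to the `story` list in
# generate_pdf_report. The width/height values are illustrative assumptions.
def _chart_flowable_from_base64(image_base64: str):
    """Decode a base64 PNG (from create_chart_image) into an Image flowable."""
    if not REPORTLAB_AVAILABLE or not image_base64:
        return None
    png_buffer = io.BytesIO(base64.b64decode(image_base64))
    # platypus.Image accepts a file-like object; 5 x 3.3 inches roughly
    # preserves the aspect ratio of the 6 x 4 inch matplotlib figure.
    return Image(png_buffer, width=5 * inch, height=3.3 * inch)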


def generate_csv_report(results: Dict[str, Any]) -> str:
    """Generate a CSV report of all analyzed articles."""
    try:
        import csv

        output = io.StringIO()
        writer = csv.writer(output)

        # Write header
        writer.writerow([
            'Title', 'Source', 'URL', 'Date', 'Sentiment_Score', 'Sentiment_Label',
            'VADER_Score', 'LM_Score', 'FinBERT_Score', 'Summary'
        ])

        # Write one row per article
        for article in results.get('articles', []):
            sentiment = article.get('sentiment', {})
            compound = sentiment.get('compound', 0)

            # Determine sentiment label
            if compound > 0.1:
                label = 'Positive'
            elif compound < -0.1:
                label = 'Negative'
            else:
                label = 'Neutral'

            summary = article.get('summary', '')
            if len(summary) > 200:
                summary = summary[:200] + '...'

            writer.writerow([
                article.get('title', ''),
                article.get('source', ''),
                article.get('url', ''),
                article.get('date', ''),
                compound,
                label,
                sentiment.get('vader', ''),
                sentiment.get('loughran_mcdonald', ''),
                sentiment.get('finbert', ''),
                summary
            ])
        return output.getvalue()
    except Exception as e:
        logger.error(f"CSV generation failed: {str(e)}")
        return "Error generating CSV report"


def generate_json_report(results: Dict[str, Any]) -> str:
    """Generate a JSON report with formatted output."""
    # Imported outside the try block so the error path below can use it too
    import json

    try:
        # Create comprehensive report
        report = {
            'metadata': {
                'report_generated': datetime.now().isoformat(),
                'query': results.get('query', ''),
                'total_articles': results.get('total_articles', 0),
                'processing_time_seconds': results.get('processing_time', 0),
                'languages': results.get('languages', ['English'])
            },
            'summary': {
                'average_sentiment': results.get('average_sentiment', 0),
                'sentiment_distribution': results.get('sentiment_distribution', {}),
                'top_sources': _get_top_sources(results),
                'date_range': results.get('summary', {}).get('date_range', {})
            },
            'articles': results.get('articles', []),
            'keywords': results.get('keywords', [])[:20],  # Top 20 keywords
            'analysis_methods': {
                'sentiment_models': ['VADER', 'Loughran-McDonald', 'FinBERT'],
                'summarization_model': 'DistilBART',
                'keyword_extraction': 'YAKE',
                'translation_models': ['Helsinki-NLP Opus-MT']
            }
        }
        return json.dumps(report, indent=2, default=str, ensure_ascii=False)
    except Exception as e:
        logger.error(f"JSON generation failed: {str(e)}")
        return json.dumps({'error': str(e)}, indent=2)


def _get_top_sources(results: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Get the top news sources from results."""
    try:
        articles = results.get('articles', [])
        sources = {}
        for article in articles:
            source = article.get('source', 'Unknown')
            sources[source] = sources.get(source, 0) + 1

        # Convert to a list of records and sort by count, descending
        source_list = [
            {'source': source, 'count': count, 'percentage': round((count / len(articles)) * 100, 1)}
            for source, count in sources.items()
        ]
        return sorted(source_list, key=lambda x: x['count'], reverse=True)[:10]
    except Exception as e:
        logger.error(f"Top sources calculation failed: {str(e)}")
        return []


def validate_report_data(results: Dict[str, Any]) -> bool:
    """Validate that results contain the data required for reporting."""
    required_keys = ['query', 'articles', 'total_articles']
    for key in required_keys:
        if key not in results:
            logger.error(f"Missing required key for reporting: {key}")
            return False
    if not isinstance(results['articles'], list) or len(results['articles']) == 0:
        logger.error("No articles available for reporting")
        return False
    return True


# Export functions
__all__ = [
    'generate_pdf_report',
    'generate_csv_report',
    'generate_json_report',
    'create_chart_image',
    'validate_report_data'
]
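

# Minimal smoke test / usage sketch. The `sample_results` dict below is a
# hypothetical example of the schema the functions above read from (inferred
# from their accessors); the real pipeline that produces it lives elsewhere.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    sample_results = {
        'query': 'example corp',
        'total_articles': 2,
        'processing_time': 1.23,
        'average_sentiment': 0.15,
        'sentiment_distribution': {'Positive': 1, 'Negative': 0, 'Neutral': 1},
        'keywords': [{'keyword': 'earnings', 'score': 0.042, 'relevance': 'high'}],
        'articles': [
            {
                'title': 'Example Corp beats earnings expectations',
                'source': 'Example Wire',
                'url': 'https://example.com/a1',
                'date': '2024-01-15',
                'sentiment': {'compound': 0.45, 'vader': 0.5, 'loughran_mcdonald': 0.4, 'finbert': 0.45},
                'summary': 'Example Corp reported stronger-than-expected quarterly earnings.'
            },
            {
                'title': 'Example Corp announces routine board meeting',
                'source': 'Example Times',
                'url': 'https://example.com/a2',
                'date': '2024-01-14',
                'sentiment': {'compound': 0.0},
                'summary': 'The company scheduled its annual board meeting.'
            }
        ]
    }
    if validate_report_data(sample_results):
        print(generate_csv_report(sample_results)[:200])
        print(generate_json_report(sample_results)[:200])
        pdf = generate_pdf_report(sample_results)
        print(f"PDF bytes generated: {len(pdf.getvalue())}")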