import base64
import io
import logging
from datetime import datetime
from typing import Any, Dict, List, Optional, Tuple

logger = logging.getLogger(__name__)

# -------------------------------
# Optional PDF backends
# -------------------------------
try:
    from reportlab.lib.pagesizes import A4
    from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle
    from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
    from reportlab.lib.units import inch
    from reportlab.lib import colors
    REPORTLAB_AVAILABLE = True
except ImportError:
    REPORTLAB_AVAILABLE = False

try:
    from fpdf import FPDF
    FPDF_AVAILABLE = True
except ImportError:
    FPDF_AVAILABLE = False

# Optional plotting for chart images (base64)
try:
    import matplotlib
    matplotlib.use('Agg')  # select the headless backend before pyplot is imported
    import matplotlib.pyplot as plt
    MATPLOTLIB_AVAILABLE = True
except ImportError:
    MATPLOTLIB_AVAILABLE = False

# -------------------------------
# Small helpers
# -------------------------------
def _safe_div(a: float, b: float) -> float:
    try:
        return (a / b) if b else 0.0
    except Exception:
        return 0.0

def _norm_dist_from_results(results: Dict[str, Any]) -> Tuple[int, Dict[str, int], float]:
    """
    Normalize fields from both the legacy structure and the new API structure.
    Returns:
        total_articles,
        counts dict {'Positive': int, 'Negative': int, 'Neutral': int},
        average_sentiment (float)
    """
    # Prefer the new API shape: results["summary"]["distribution"] etc.
    articles = results.get("articles", []) or []
    total = results.get("total_articles") or len(articles)  # backfill if missing
    avg = 0.0
    if "summary" in results:
        avg = results["summary"].get("average_sentiment", 0.0) or 0.0
        dist = results["summary"].get("distribution", {}) or {}
        pos = dist.get("positive") or dist.get("Positive") or 0
        neg = dist.get("negative") or dist.get("Negative") or 0
        neu = dist.get("neutral") or dist.get("Neutral") or 0
    else:
        # Legacy keys (if present)
        avg = results.get("average_sentiment", 0.0) or 0.0
        legacy = results.get("sentiment_distribution", {}) or {}
        pos = legacy.get("Positive") or legacy.get("positive") or 0
        neg = legacy.get("Negative") or legacy.get("negative") or 0
        neu = legacy.get("Neutral") or legacy.get("neutral") or 0
    # If counts are 0 but we have articles, compute from article sentiments
    if (pos + neg + neu == 0) and articles:
        for a in articles:
            c = (a.get("sentiment") or {}).get("compound", 0.0)
            if c > 0.1:
                pos += 1
            elif c < -0.1:
                neg += 1
            else:
                neu += 1
    return total, {"Positive": pos, "Negative": neg, "Neutral": neu}, float(avg)

def _get_processing_time(results: Dict[str, Any]) -> float:
    # New structure: results["summary"]["processing"]["processing_time_seconds"]
    try:
        val = results.get("summary", {}).get("processing", {}).get("processing_time_seconds")
        if val is not None:
            return float(val)
    except Exception:
        pass
    # Legacy field (also the fallback when the new field is absent)
    try:
        return float(results.get("processing_time", 0.0))
    except Exception:
        return 0.0

# -------------------------------
# Public API
# -------------------------------
def generate_pdf_report(results: Dict[str, Any]) -> io.BytesIO:
    """
    Generate a comprehensive PDF report.
    Returns a BytesIO buffer so Streamlit can download directly.
    """
    if REPORTLAB_AVAILABLE:
        try:
            return _generate_pdf_with_reportlab(results)
        except Exception as e:
            logger.exception(f"ReportLab PDF generation failed: {e}")
    # Fallback
    if FPDF_AVAILABLE:
        return _generate_simple_pdf_fallback(results)
    # Last resort: a tiny text buffer
    buf = io.BytesIO()
    buf.write(b"PDF generation is unavailable (ReportLab/FPDF not installed).")
    buf.seek(0)
    return buf

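# Sketch of the intended Streamlit hookup (hypothetical app code; `results` is
# whatever the analysis pipeline produced):
#
#   import streamlit as st
#
#   pdf_buffer = generate_pdf_report(results)
#   st.download_button(
#       label="Download PDF report",
#       data=pdf_buffer,
#       file_name="news_report.pdf",
#       mime="application/pdf",
#   )
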
# -------------------------------
# ReportLab implementation
# -------------------------------
def _generate_pdf_with_reportlab(results: Dict[str, Any]) -> io.BytesIO:
    buffer = io.BytesIO()
    doc = SimpleDocTemplate(
        buffer,
        pagesize=A4,
        rightMargin=72,
        leftMargin=72,
        topMargin=72,
        bottomMargin=18,
    )
    styles = getSampleStyleSheet()
    title_style = ParagraphStyle(
        'CustomTitle',
        parent=styles['Heading1'],
        fontSize=22,
        spaceAfter=24,
        textColor=colors.HexColor('#2E86AB'),
        alignment=1,  # Center
    )
    heading_style = ParagraphStyle(
        'CustomHeading',
        parent=styles['Heading2'],
        fontSize=14,
        spaceAfter=10,
        spaceBefore=18,
        textColor=colors.HexColor('#2E86AB'),
    )
    story: List[Any] = []

    # Title
    query = results.get('query', 'N/A')
    story.append(Paragraph("Global Business News Intelligence Report", title_style))
    story.append(Spacer(1, 0.35 * inch))
    story.append(Paragraph(f"Analysis Target: {query}", styles['Normal']))
    story.append(Paragraph(f"Report Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", styles['Normal']))
    total, dist_counts, avg = _norm_dist_from_results(results)
    proc_time = _get_processing_time(results)
    story.append(Paragraph(f"Total Articles Analyzed: {total}", styles['Normal']))
    story.append(Paragraph(f"Processing Time: {proc_time:.2f} seconds", styles['Normal']))
    story.append(Spacer(1, 0.25 * inch))

    # Executive Summary
    story.append(Paragraph("Executive Summary", heading_style))
    story.append(Paragraph(_create_executive_summary(query, total, avg, dist_counts), styles['Normal']))
    story.append(Spacer(1, 0.2 * inch))

    # Sentiment Analysis
    story.append(Paragraph("Sentiment Analysis", heading_style))
    story.extend(_create_sentiment_section(total, dist_counts, styles))

    # Key Stories
    story.append(Paragraph("Key Stories", heading_style))
    story.extend(_create_stories_section(results, styles))

    # Keywords
    keywords = results.get('keywords') or []
    if keywords:
        story.append(Paragraph("Key Topics and Themes", heading_style))
        story.extend(_create_keywords_section(keywords, styles))

    # Sources
    story.append(Paragraph("News Sources", heading_style))
    story.extend(_create_sources_section(results, styles))

    # Methodology
    story.append(Paragraph("Methodology", heading_style))
    story.append(Paragraph(_create_methodology_section(results, total, proc_time), styles['Normal']))

    doc.build(story)
    buffer.seek(0)
    return buffer

def _create_executive_summary(query: str, total: int, avg_sentiment: float, dist_counts: Dict[str, int]) -> str:
    try:
        if total == 0:
            return f"No articles were available to analyze for “{query}”."
        label = "positive" if avg_sentiment > 0.1 else "negative" if avg_sentiment < -0.1 else "neutral"
        pos = dist_counts.get("Positive", 0)
        neg = dist_counts.get("Negative", 0)
        neu = dist_counts.get("Neutral", 0)
        pct_pos = _safe_div(pos, total) * 100.0
        pct_neg = _safe_div(neg, total) * 100.0
        pct_neu = _safe_div(neu, total) * 100.0
        summary = (
            f"This report analyzes {total} news articles related to “{query}”. "
            f"The overall sentiment reveals a {label} tone with an average sentiment score of {avg_sentiment:.3f}. "
            f"The analysis shows {pos} positive articles ({pct_pos:.1f}%), "
            f"{neg} negative articles ({pct_neg:.1f}%), and {neu} neutral articles ({pct_neu:.1f}%). "
        )
        if avg_sentiment > 0.2:
            summary += "Predominantly positive coverage suggests favorable market conditions or public perception."
        elif avg_sentiment < -0.2:
            summary += "Predominantly negative coverage indicates concerns or challenges that may require attention."
        else:
            summary += "Balanced coverage suggests a mixed outlook with both opportunities and challenges."
        return summary
    except Exception as e:
        logger.exception(f"Executive summary creation failed: {e}")
        return "Analysis completed successfully with comprehensive sentiment evaluation across multiple news sources."

def _create_sentiment_section(total: int, dist_counts: Dict[str, int], styles) -> List[Any]:
    story: List[Any] = []
    try:
        pos = dist_counts.get("Positive", 0)
        neg = dist_counts.get("Negative", 0)
        neu = dist_counts.get("Neutral", 0)
        data = [
            ['Sentiment', 'Count', 'Percentage'],
            ['Positive', str(pos), f"{_safe_div(pos, total) * 100:.1f}%"],
            ['Negative', str(neg), f"{_safe_div(neg, total) * 100:.1f}%"],
            ['Neutral', str(neu), f"{_safe_div(neu, total) * 100:.1f}%"],
        ]
        table = Table(data)
        table.setStyle(TableStyle([
            ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#2E86AB')),
            ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
            ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
            ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
            ('FONTSIZE', (0, 0), (-1, 0), 12),
            ('BOTTOMPADDING', (0, 0), (-1, 0), 10),
            ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
            ('GRID', (0, 0), (-1, -1), 1, colors.black),
        ]))
        story.append(table)
        story.append(Spacer(1, 0.2 * inch))
        explanation = (
            "Sentiment analysis was performed using multiple models including VADER, "
            "Loughran–McDonald (financial), and FinBERT. Scores range from -1.0 (most negative) "
            "to +1.0 (most positive), with -0.1 to +0.1 considered neutral."
        )
        story.append(Paragraph(explanation, styles['Normal']))
        story.append(Spacer(1, 0.1 * inch))
    except Exception as e:
        logger.exception(f"Sentiment section creation failed: {e}")
        story.append(Paragraph("Sentiment analysis data unavailable.", styles['Normal']))
    return story

def _create_stories_section(results: Dict[str, Any], styles) -> List[Any]:
    story: List[Any] = []
    try:
        articles = results.get('articles', []) or []
        if not articles:
            story.append(Paragraph("No articles available for analysis.", styles['Normal']))
            return story
        # Sort by compound sentiment
        sorted_by_pos = sorted(articles, key=lambda x: (x.get('sentiment') or {}).get('compound', 0.0), reverse=True)
        sorted_by_neg = sorted(articles, key=lambda x: (x.get('sentiment') or {}).get('compound', 0.0))
        # Most positive
        if sorted_by_pos and (sorted_by_pos[0].get('sentiment') or {}).get('compound', 0.0) > 0.1:
            a = sorted_by_pos[0]
            story.append(Paragraph("Most Positive Coverage:", styles['Heading3']))
            story.append(Paragraph(f"<b>Title:</b> {a.get('title', 'N/A')}", styles['Normal']))
            story.append(Paragraph(f"<b>Source:</b> {a.get('source', 'N/A')}", styles['Normal']))
            story.append(Paragraph(f"<b>Sentiment Score:</b> {(a.get('sentiment') or {}).get('compound', 0.0):.3f}", styles['Normal']))
            if a.get('summary'):
                story.append(Paragraph(f"<b>Summary:</b> {a['summary'][:300]}{'...' if len(a['summary']) > 300 else ''}", styles['Normal']))
            story.append(Spacer(1, 0.15 * inch))
        # Most negative
        if sorted_by_neg and (sorted_by_neg[0].get('sentiment') or {}).get('compound', 0.0) < -0.1:
            a = sorted_by_neg[0]
            story.append(Paragraph("Most Negative Coverage:", styles['Heading3']))
            story.append(Paragraph(f"<b>Title:</b> {a.get('title', 'N/A')}", styles['Normal']))
            story.append(Paragraph(f"<b>Source:</b> {a.get('source', 'N/A')}", styles['Normal']))
            story.append(Paragraph(f"<b>Sentiment Score:</b> {(a.get('sentiment') or {}).get('compound', 0.0):.3f}", styles['Normal']))
            if a.get('summary'):
                story.append(Paragraph(f"<b>Summary:</b> {a['summary'][:300]}{'...' if len(a['summary']) > 300 else ''}", styles['Normal']))
        # Latest coverage (if dates are present)
        recent = [a for a in articles if a.get('date')]
        if recent:
            try:
                recent.sort(key=lambda x: x.get('date'), reverse=True)
                r = recent[0]
                story.append(Spacer(1, 0.15 * inch))
                story.append(Paragraph("Most Recent Coverage:", styles['Heading3']))
                story.append(Paragraph(f"<b>Title:</b> {r.get('title', 'N/A')}", styles['Normal']))
                story.append(Paragraph(f"<b>Source:</b> {r.get('source', 'N/A')}", styles['Normal']))
                story.append(Paragraph(f"<b>Date:</b> {r.get('date')}", styles['Normal']))
                story.append(Paragraph(f"<b>Sentiment Score:</b> {(r.get('sentiment') or {}).get('compound', 0.0):.3f}", styles['Normal']))
            except Exception:
                pass
    except Exception as e:
        logger.exception(f"Stories section creation failed: {e}")
        story.append(Paragraph("Story analysis data unavailable.", styles['Normal']))
    return story

def _create_keywords_section(keywords: List[Dict[str, Any]], styles) -> List[Any]:
    story: List[Any] = []
    try:
        top = keywords[:15]
        if not top:
            story.append(Paragraph("No keywords extracted.", styles['Normal']))
            return story
        data = [['Keyword', 'Score', 'Category']]
        for kw in top:
            score = kw.get('score', 0.0)
            relevance = kw.get('relevance', 'medium')
            data.append([kw.get('keyword', 'N/A'), f"{score:.3f}", str(relevance).title()])
        table = Table(data)
        table.setStyle(TableStyle([
            ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#2E86AB')),
            ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
            ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
            ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
            ('FONTSIZE', (0, 0), (-1, 0), 10),
            ('BOTTOMPADDING', (0, 0), (-1, 0), 10),
            ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
            ('GRID', (0, 0), (-1, -1), 1, colors.black),
        ]))
        story.append(table)
        story.append(Spacer(1, 0.15 * inch))
        expl = ("Keywords were extracted using the YAKE algorithm, which identifies relevant terms and phrases "
                "based on statistical features of the text corpus.")
        story.append(Paragraph(expl, styles['Normal']))
    except Exception as e:
        logger.exception(f"Keywords section creation failed: {e}")
        story.append(Paragraph("Keyword analysis data unavailable.", styles['Normal']))
    return story

def _create_sources_section(results: Dict[str, Any], styles) -> List[Any]:
    story: List[Any] = []
    try:
        articles = results.get('articles', []) or []
        if not articles:
            story.append(Paragraph("No source data available.", styles['Normal']))
            return story
        # Count sources
        counts: Dict[str, int] = {}
        for a in articles:
            src = a.get('source', 'Unknown')
            counts[src] = counts.get(src, 0) + 1
        total = len(articles)
        data = [['News Source', 'Article Count', 'Percentage']]
        for src, ct in sorted(counts.items(), key=lambda x: x[1], reverse=True):
            data.append([src, str(ct), f"{_safe_div(ct, total) * 100:.1f}%"])
        table = Table(data)
        table.setStyle(TableStyle([
            ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#2E86AB')),
            ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
            ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
            ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
            ('FONTSIZE', (0, 0), (-1, 0), 10),
            ('BOTTOMPADDING', (0, 0), (-1, 0), 10),
            ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
            ('GRID', (0, 0), (-1, -1), 1, colors.black),
        ]))
        story.append(table)
        story.append(Spacer(1, 0.15 * inch))
        expl = (f"Articles were collected from {len(counts)} different sources, providing diverse perspectives. "
                "Source diversity helps ensure comprehensive coverage and reduces bias.")
        story.append(Paragraph(expl, styles['Normal']))
    except Exception as e:
        logger.exception(f"Sources section creation failed: {e}")
        story.append(Paragraph("Source analysis data unavailable.", styles['Normal']))
    return story

def _create_methodology_section(results: Dict[str, Any], total: int, proc_time: float) -> str:
    # ReportLab's Paragraph collapses literal newlines, so paragraph breaks must
    # use <br/> tags; the bare ampersand is escaped for the same XML parser.
    meth = (
        "This analysis employed a comprehensive NLP pipeline:<br/><br/>"
        "1. <b>Data Collection:</b> Articles were gathered from multiple RSS/business feeds. "
        "Content was filtered for relevance and deduplicated.<br/><br/>"
        "2. <b>Sentiment Analysis:</b> VADER (general), Loughran–McDonald (finance), and FinBERT (finance) were combined. "
        "Final scores reflect a weighted composite.<br/><br/>"
        "3. <b>Summarization &amp; Keywords:</b> Articles were cleaned and summarized (transformer models when available), "
        "and key themes extracted with YAKE.<br/><br/>"
        "4. <b>Quality Controls:</b> English-only filtering, minimum length checks, and relevance filters.<br/><br/>"
    )
    meth += f"Processed {total} articles in {proc_time:.2f} seconds."
    return meth

# -------------------------------
# FPDF fallback
# -------------------------------
def _generate_simple_pdf_fallback(results: Dict[str, Any]) -> io.BytesIO:
    total, dist_counts, avg = _norm_dist_from_results(results)
    query = results.get('query', 'N/A')
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font('Arial', 'B', 16)
    pdf.cell(0, 10, 'News Analysis Report', ln=True)
    pdf.ln(5)
    pdf.set_font('Arial', '', 12)
    pdf.cell(0, 8, f"Query: {query}", ln=True)
    pdf.cell(0, 8, f"Articles: {total}", ln=True)
    pdf.cell(0, 8, f"Average Sentiment: {avg:.3f}", ln=True)
    pdf.ln(5)
    pos, neg, neu = dist_counts.get("Positive", 0), dist_counts.get("Negative", 0), dist_counts.get("Neutral", 0)
    pdf.cell(0, 8, "Sentiment Distribution:", ln=True)
    pdf.cell(0, 8, f"  Positive: {pos} ({_safe_div(pos, total) * 100:.1f}%)", ln=True)
    pdf.cell(0, 8, f"  Negative: {neg} ({_safe_div(neg, total) * 100:.1f}%)", ln=True)
    pdf.cell(0, 8, f"  Neutral: {neu} ({_safe_div(neu, total) * 100:.1f}%)", ln=True)
    buf = io.BytesIO()
    # PyFPDF returns a str from output(dest='S'); fpdf2 returns a bytearray.
    raw = pdf.output(dest='S')
    pdf_bytes = raw.encode('latin-1') if isinstance(raw, str) else bytes(raw)
    buf.write(pdf_bytes)
    buf.seek(0)
    return buf

# -------------------------------
# Optional chart image (base64)
# -------------------------------
def create_chart_image(data: Dict, chart_type: str = 'pie') -> Optional[str]:
    if not MATPLOTLIB_AVAILABLE:
        return None
    try:
        plt.figure(figsize=(6, 4))
        if chart_type == 'pie':
            # Support both shapes
            total, dist_counts, _ = _norm_dist_from_results(data if 'articles' in data else {'summary': {'distribution': data}})
            labels = ['Positive', 'Negative', 'Neutral']
            sizes = [
                dist_counts.get('Positive', 0),
                dist_counts.get('Negative', 0),
                dist_counts.get('Neutral', 0),
            ]
            plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)
            plt.title('Sentiment Distribution')
        elif chart_type == 'bar' and 'articles' in data:
            sources: Dict[str, int] = {}
            for a in data.get('articles', []):
                s = a.get('source', 'Unknown')
                sources[s] = sources.get(s, 0) + 1
            top = dict(sorted(sources.items(), key=lambda x: x[1], reverse=True)[:10])
            plt.bar(range(len(top)), list(top.values()))
            plt.xticks(range(len(top)), list(top.keys()), rotation=45, ha='right')
            plt.title('Articles by Source')
            plt.ylabel('Count')
        plt.tight_layout()
        buf = io.BytesIO()
        plt.savefig(buf, format='png', dpi=150, bbox_inches='tight')
        buf.seek(0)
        img64 = base64.b64encode(buf.getvalue()).decode()
        plt.close()
        return img64
    except Exception as e:
        logger.exception(f"Chart creation failed: {e}")
        plt.close('all')  # do not leak the figure on failure
        return None

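# Sketch of embedding the returned base64 PNG in HTML, e.g. with Streamlit's
# st.markdown(..., unsafe_allow_html=True) (hypothetical app code):
#
#   img64 = create_chart_image(results, chart_type='pie')
#   if img64:
#       html = f'<img src="data:image/png;base64,{img64}" alt="Sentiment distribution"/>'
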
# -------------------------------
# CSV / JSON helpers (unchanged public API)
# -------------------------------
def generate_csv_report(results: Dict[str, Any]) -> str:
    try:
        import csv
        import io as _io
        out = _io.StringIO()
        w = csv.writer(out)
        w.writerow(['Title', 'Source', 'URL', 'Date', 'Sentiment_Score', 'Sentiment_Label',
                    'VADER_Score', 'LM_Score', 'FinBERT_Score', 'Summary'])
        for a in results.get('articles', []):
            s = a.get('sentiment', {}) or {}
            compound = s.get('compound', 0.0)
            if compound > 0.1:
                label = 'Positive'
            elif compound < -0.1:
                label = 'Negative'
            else:
                label = 'Neutral'
            summary = a.get('summary', '') or ''  # guard against an explicit None
            w.writerow([
                a.get('title', ''),
                a.get('source', ''),
                a.get('url', ''),
                a.get('date', ''),
                compound,
                label,
                s.get('vader', ''),
                s.get('loughran_mcdonald', ''),
                s.get('finbert', ''),
                (summary[:200] + '...') if len(summary) > 200 else summary,
            ])
        return out.getvalue()
    except Exception as e:
        logger.exception(f"CSV generation failed: {e}")
        return "Error generating CSV report"

def generate_json_report(results: Dict[str, Any]) -> str:
    try:
        import json
        meta = {
            'report_generated': datetime.now().isoformat(),
            'query': results.get('query', ''),
            'languages': results.get('languages', ['English']),
        }
        total, dist_counts, avg = _norm_dist_from_results(results)
        summary = {
            'total_articles': total,
            'average_sentiment': avg,
            'sentiment_distribution': dist_counts,
            'top_sources': _get_top_sources(results),
        }
        report = {
            'metadata': meta,
            'summary': summary,
            'articles': results.get('articles', []),
            'keywords': (results.get('keywords', []) or [])[:20],
            'analysis_methods': {
                'sentiment_models': ['VADER', 'Loughran-McDonald', 'FinBERT'],
                'summarization_model': 'BART/DistilBART/T5 (when available)',
                'keyword_extraction': 'YAKE',
                'translation_models': ['Helsinki-NLP Opus-MT'],
            },
        }
        return json.dumps(report, indent=2, default=str, ensure_ascii=False)
    except Exception as e:
        logger.exception(f"JSON generation failed: {e}")
        try:
            import json
            return json.dumps({'error': str(e)}, indent=2)
        except Exception:
            return '{"error":"JSON generation failed"}'

def _get_top_sources(results: Dict[str, Any]) -> List[Dict[str, Any]]:
    try:
        arts = results.get('articles', []) or []
        total = len(arts)
        counts: Dict[str, int] = {}
        for a in arts:
            src = a.get('source', 'Unknown')
            counts[src] = counts.get(src, 0) + 1
        items = [
            {'source': s, 'count': c, 'percentage': round(_safe_div(c, total) * 100.0, 1)}
            for s, c in counts.items()
        ]
        return sorted(items, key=lambda x: x['count'], reverse=True)[:10]
    except Exception as e:
        logger.exception(f"Top sources calculation failed: {e}")
        return []

def validate_report_data(results: Dict[str, Any]) -> bool:
    """
    Validate that results contain required data for reporting.
    We're lenient now: require 'articles' and 'query'.
    """
    if 'query' not in results or 'articles' not in results:
        logger.error("Missing required keys: 'query' and/or 'articles'")
        return False
    if not isinstance(results['articles'], list) or len(results['articles']) == 0:
        logger.error("No articles available for reporting")
        return False
    return True

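# Typical gate before building any report (illustrative):
#
#   if validate_report_data(results):
#       pdf = generate_pdf_report(results)
#   else:
#       st.warning("Nothing to report yet - run an analysis first.")
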
__all__ = [
    'generate_pdf_report',
    'generate_csv_report',
    'generate_json_report',
    'create_chart_image',
    'validate_report_data',
]

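if __name__ == "__main__":
    # Minimal smoke test with synthetic data (illustrative only; real results
    # come from the upstream analysis pipeline).
    demo_results = {
        "query": "ACME Corp",
        "articles": [
            {"title": "ACME beats earnings", "source": "Example Wire",
             "url": "https://example.com/1", "date": "2024-01-02",
             "sentiment": {"compound": 0.6}, "summary": "Strong quarter."},
            {"title": "ACME faces lawsuit", "source": "Example Times",
             "url": "https://example.com/2", "date": "2024-01-03",
             "sentiment": {"compound": -0.4}, "summary": "Legal trouble."},
        ],
    }
    if validate_report_data(demo_results):
        pdf_buf = generate_pdf_report(demo_results)
        print(f"PDF buffer size: {len(pdf_buf.getvalue())} bytes")
        print(generate_csv_report(demo_results).splitlines()[0])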