|
import logging |
|
from typing import Dict, List, Any, Optional, Tuple |
|
import io |
|
from datetime import datetime |
|
import base64 |
|
|
|
logger = logging.getLogger(__name__) |
|
|
|
|
|
|
|
|
|
try: |
|
from reportlab.lib.pagesizes import A4 |
|
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle |
|
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle |
|
from reportlab.lib.units import inch |
|
from reportlab.lib import colors |
|
REPORTLAB_AVAILABLE = True |
|
except ImportError: |
|
REPORTLAB_AVAILABLE = False |
|
|
|
try: |
|
from fpdf import FPDF |
|
FPDF_AVAILABLE = True |
|
except ImportError: |
|
FPDF_AVAILABLE = False |
|
|
|
|
|
try: |
|
    import matplotlib
    matplotlib.use('Agg')  # select the non-interactive Agg backend before pyplot is imported
    import matplotlib.pyplot as plt
|
MATPLOTLIB_AVAILABLE = True |
|
except ImportError: |
|
MATPLOTLIB_AVAILABLE = False |
|
|
|
|
|
|
|
|
|
|
|
def _safe_div(a: float, b: float) -> float: |
|
try: |
|
return (a / b) if b else 0.0 |
|
except Exception: |
|
return 0.0 |
|
|
|
|
|
def _norm_dist_from_results(results: Dict[str, Any]) -> Tuple[int, Dict[str, int], float]: |
|
""" |
|
Normalize fields from both the legacy structure and the new API structure. |
|
Returns: |
|
total_articles, |
|
counts dict {'Positive': int, 'Negative': int, 'Neutral': int}, |
|
average_sentiment (float) |
|
""" |
|
|
|
articles = results.get("articles", []) or [] |
|
total = results.get("total_articles") or len(articles) |
|
|
|
avg = 0.0 |
|
if "summary" in results: |
|
avg = results["summary"].get("average_sentiment", 0.0) or 0.0 |
|
dist = results["summary"].get("distribution", {}) or {} |
|
pos = dist.get("positive") or dist.get("Positive") or 0 |
|
neg = dist.get("negative") or dist.get("Negative") or 0 |
|
neu = dist.get("neutral") or dist.get("Neutral") or 0 |
|
else: |
|
|
|
avg = results.get("average_sentiment", 0.0) or 0.0 |
|
legacy = results.get("sentiment_distribution", {}) or {} |
|
pos = legacy.get("Positive") or legacy.get("positive") or 0 |
|
neg = legacy.get("Negative") or legacy.get("negative") or 0 |
|
neu = legacy.get("Neutral") or legacy.get("neutral") or 0 |
|
|
|
|
|
if (pos + neg + neu == 0) and articles: |
|
for a in articles: |
|
c = (a.get("sentiment") or {}).get("compound", 0.0) |
|
if c > 0.1: |
|
pos += 1 |
|
elif c < -0.1: |
|
neg += 1 |
|
else: |
|
neu += 1 |
|
|
|
return total, {"Positive": pos, "Negative": neg, "Neutral": neu}, float(avg) |
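
# Illustrative sketch of the normalization above (keys mirror exactly what this
# function reads; the values are made-up example numbers):
#
#     _norm_dist_from_results({
#         "total_articles": 6,
#         "summary": {"average_sentiment": 0.2,
#                     "distribution": {"positive": 3, "negative": 1, "neutral": 2}},
#     })
#     # -> (6, {"Positive": 3, "Negative": 1, "Neutral": 2}, 0.2)
#
# A legacy-shaped dict with "average_sentiment" and "sentiment_distribution"
# at the top level normalizes to the same triple.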
|
|
|
|
|
def _get_processing_time(results: Dict[str, Any]) -> float: |
|
|
|
try: |
|
return float(results.get("summary", {}).get("processing", {}).get("processing_time_seconds", 0.0)) |
|
except Exception: |
|
pass |
|
|
|
try: |
|
return float(results.get("processing_time", 0.0)) |
|
except Exception: |
|
return 0.0 |
|
|
|
|
|
|
|
|
|
|
|
def generate_pdf_report(results: Dict[str, Any]) -> io.BytesIO: |
|
""" |
|
Generate a comprehensive PDF report. |
|
Returns a BytesIO buffer so Streamlit can download directly. |
|
""" |
|
if REPORTLAB_AVAILABLE: |
|
try: |
|
return _generate_pdf_with_reportlab(results) |
|
except Exception as e: |
|
logger.exception(f"ReportLab PDF generation failed: {e}") |
|
|
|
|
|
    if FPDF_AVAILABLE:
        try:
            return _generate_simple_pdf_fallback(results)
        except Exception as e:
            logger.exception(f"FPDF fallback PDF generation failed: {e}")
|
|
|
|
|
buf = io.BytesIO() |
|
buf.write(b"PDF generation is unavailable (ReportLab/FPDF not installed).") |
|
buf.seek(0) |
|
return buf |
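
# Usage sketch (assumes a Streamlit app; `results` is the analyzer output dict):
#
#     pdf_buffer = generate_pdf_report(results)
#     st.download_button("Download PDF report", data=pdf_buffer,
#                        file_name="news_report.pdf", mime="application/pdf")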
|
|
|
|
|
|
|
|
|
|
|
def _generate_pdf_with_reportlab(results: Dict[str, Any]) -> io.BytesIO: |
|
buffer = io.BytesIO() |
|
|
|
doc = SimpleDocTemplate( |
|
buffer, |
|
pagesize=A4, |
|
rightMargin=72, |
|
leftMargin=72, |
|
topMargin=72, |
|
bottomMargin=18, |
|
) |
|
|
|
styles = getSampleStyleSheet() |
|
title_style = ParagraphStyle( |
|
'CustomTitle', |
|
parent=styles['Heading1'], |
|
fontSize=22, |
|
spaceAfter=24, |
|
textColor=colors.HexColor('#2E86AB'), |
|
alignment=1 |
|
) |
|
heading_style = ParagraphStyle( |
|
'CustomHeading', |
|
parent=styles['Heading2'], |
|
fontSize=14, |
|
spaceAfter=10, |
|
spaceBefore=18, |
|
textColor=colors.HexColor('#2E86AB') |
|
) |
|
|
|
story: List[Any] = [] |
|
|
|
|
|
query = results.get('query', 'N/A') |
|
story.append(Paragraph(f"Global Business News Intelligence Report", title_style)) |
|
story.append(Spacer(1, 0.35 * inch)) |
|
story.append(Paragraph(f"Analysis Target: {query}", styles['Normal'])) |
|
story.append(Paragraph(f"Report Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", styles['Normal'])) |
|
|
|
total, dist_counts, avg = _norm_dist_from_results(results) |
|
proc_time = _get_processing_time(results) |
|
story.append(Paragraph(f"Total Articles Analyzed: {total}", styles['Normal'])) |
|
story.append(Paragraph(f"Processing Time: {proc_time:.2f} seconds", styles['Normal'])) |
|
story.append(Spacer(1, 0.25 * inch)) |
|
|
|
|
|
story.append(Paragraph("Executive Summary", heading_style)) |
|
story.append(Paragraph(_create_executive_summary(query, total, avg, dist_counts), styles['Normal'])) |
|
story.append(Spacer(1, 0.2 * inch)) |
|
|
|
|
|
story.append(Paragraph("Sentiment Analysis", heading_style)) |
|
story.extend(_create_sentiment_section(total, dist_counts, styles)) |
|
|
|
|
|
story.append(Paragraph("Key Stories", heading_style)) |
|
story.extend(_create_stories_section(results, styles)) |
|
|
|
|
|
keywords = results.get('keywords') or [] |
|
if keywords: |
|
story.append(Paragraph("Key Topics and Themes", heading_style)) |
|
story.extend(_create_keywords_section(keywords, styles)) |
|
|
|
|
|
story.append(Paragraph("News Sources", heading_style)) |
|
story.extend(_create_sources_section(results, styles)) |
|
|
|
|
|
story.append(Paragraph("Methodology", heading_style)) |
|
story.append(Paragraph(_create_methodology_section(results, total, proc_time), styles['Normal'])) |
|
|
|
doc.build(story) |
|
buffer.seek(0) |
|
return buffer |
|
|
|
|
|
def _create_executive_summary(query: str, total: int, avg_sentiment: float, dist_counts: Dict[str, int]) -> str: |
|
try: |
|
if total == 0: |
|
return f"No articles were available to analyze for “{query}”." |
|
|
|
label = "positive" if avg_sentiment > 0.1 else "negative" if avg_sentiment < -0.1 else "neutral" |
|
|
|
pos = dist_counts.get("Positive", 0) |
|
neg = dist_counts.get("Negative", 0) |
|
neu = dist_counts.get("Neutral", 0) |
|
|
|
pct_pos = _safe_div(pos, total) * 100.0 |
|
pct_neg = _safe_div(neg, total) * 100.0 |
|
pct_neu = _safe_div(neu, total) * 100.0 |
|
|
|
summary = ( |
|
f"This report analyzes {total} news articles related to “{query}”. " |
|
f"The overall sentiment reveals a {label} tone with an average sentiment score of {avg_sentiment:.3f}. " |
|
f"The analysis shows {pos} positive articles ({pct_pos:.1f}%), " |
|
f"{neg} negative articles ({pct_neg:.1f}%), and {neu} neutral articles ({pct_neu:.1f}%). " |
|
) |
|
|
|
if avg_sentiment > 0.2: |
|
summary += "Predominantly positive coverage suggests favorable market conditions or public perception." |
|
elif avg_sentiment < -0.2: |
|
summary += "Predominantly negative coverage indicates concerns or challenges that may require attention." |
|
else: |
|
summary += "Balanced coverage suggests a mixed outlook with both opportunities and challenges." |
|
return summary |
|
except Exception as e: |
|
logger.exception(f"Executive summary creation failed: {e}") |
|
return "Analysis completed successfully with comprehensive sentiment evaluation across multiple news sources." |
|
|
|
|
|
def _create_sentiment_section(total: int, dist_counts: Dict[str, int], styles) -> List[Any]: |
|
story: List[Any] = [] |
|
try: |
|
pos = dist_counts.get("Positive", 0) |
|
neg = dist_counts.get("Negative", 0) |
|
neu = dist_counts.get("Neutral", 0) |
|
|
|
data = [ |
|
['Sentiment', 'Count', 'Percentage'], |
|
['Positive', str(pos), f"{_safe_div(pos, total) * 100:.1f}%"], |
|
['Negative', str(neg), f"{_safe_div(neg, total) * 100:.1f}%"], |
|
['Neutral', str(neu), f"{_safe_div(neu, total) * 100:.1f}%"], |
|
] |
|
|
|
table = Table(data) |
|
table.setStyle(TableStyle([ |
|
('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#2E86AB')), |
|
('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke), |
|
('ALIGN', (0, 0), (-1, -1), 'CENTER'), |
|
('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'), |
|
('FONTSIZE', (0, 0), (-1, 0), 12), |
|
('BOTTOMPADDING', (0, 0), (-1, 0), 10), |
|
('BACKGROUND', (0, 1), (-1, -1), colors.beige), |
|
('GRID', (0, 0), (-1, -1), 1, colors.black), |
|
])) |
|
story.append(table) |
|
story.append(Spacer(1, 0.2 * inch)) |
|
|
|
explanation = ( |
|
"Sentiment analysis was performed using multiple models including VADER, " |
|
"Loughran–McDonald (financial), and FinBERT. Scores range from -1.0 (most negative) " |
|
"to +1.0 (most positive), with -0.1 to +0.1 considered neutral." |
|
) |
|
story.append(Paragraph(explanation, styles['Normal'])) |
|
story.append(Spacer(1, 0.1 * inch)) |
|
except Exception as e: |
|
logger.exception(f"Sentiment section creation failed: {e}") |
|
story.append(Paragraph("Sentiment analysis data unavailable.", styles['Normal'])) |
|
return story |
|
|
|
|
|
def _create_stories_section(results: Dict[str, Any], styles) -> List[Any]: |
|
story: List[Any] = [] |
|
try: |
|
articles = results.get('articles', []) or [] |
|
if not articles: |
|
story.append(Paragraph("No articles available for analysis.", styles['Normal'])) |
|
return story |
|
|
|
|
|
sorted_by_pos = sorted(articles, key=lambda x: (x.get('sentiment') or {}).get('compound', 0.0), reverse=True) |
|
sorted_by_neg = sorted(articles, key=lambda x: (x.get('sentiment') or {}).get('compound', 0.0)) |
|
|
|
|
|
if sorted_by_pos and (sorted_by_pos[0].get('sentiment') or {}).get('compound', 0.0) > 0.1: |
|
a = sorted_by_pos[0] |
|
story.append(Paragraph("Most Positive Coverage:", styles['Heading3'])) |
|
story.append(Paragraph(f"<b>Title:</b> {a.get('title','N/A')}", styles['Normal'])) |
|
story.append(Paragraph(f"<b>Source:</b> {a.get('source','N/A')}", styles['Normal'])) |
|
story.append(Paragraph(f"<b>Sentiment Score:</b> {(a.get('sentiment') or {}).get('compound', 0.0):.3f}", styles['Normal'])) |
|
if a.get('summary'): |
|
story.append(Paragraph(f"<b>Summary:</b> {a['summary'][:300]}{'...' if len(a['summary'])>300 else ''}", styles['Normal'])) |
|
story.append(Spacer(1, 0.15 * inch)) |
|
|
|
|
|
if sorted_by_neg and (sorted_by_neg[0].get('sentiment') or {}).get('compound', 0.0) < -0.1: |
|
a = sorted_by_neg[0] |
|
story.append(Paragraph("Most Negative Coverage:", styles['Heading3'])) |
|
story.append(Paragraph(f"<b>Title:</b> {a.get('title','N/A')}", styles['Normal'])) |
|
story.append(Paragraph(f"<b>Source:</b> {a.get('source','N/A')}", styles['Normal'])) |
|
story.append(Paragraph(f"<b>Sentiment Score:</b> {(a.get('sentiment') or {}).get('compound', 0.0):.3f}", styles['Normal'])) |
|
if a.get('summary'): |
|
story.append(Paragraph(f"<b>Summary:</b> {a['summary'][:300]}{'...' if len(a['summary'])>300 else ''}", styles['Normal'])) |
|
|
|
|
|
recent = [a for a in articles if a.get('date')] |
|
if recent: |
|
try: |
|
recent.sort(key=lambda x: x.get('date'), reverse=True) |
|
r = recent[0] |
|
story.append(Spacer(1, 0.15 * inch)) |
|
story.append(Paragraph("Most Recent Coverage:", styles['Heading3'])) |
|
story.append(Paragraph(f"<b>Title:</b> {r.get('title','N/A')}", styles['Normal'])) |
|
story.append(Paragraph(f"<b>Source:</b> {r.get('source','N/A')}", styles['Normal'])) |
|
story.append(Paragraph(f"<b>Date:</b> {r.get('date')}", styles['Normal'])) |
|
story.append(Paragraph(f"<b>Sentiment Score:</b> {(r.get('sentiment') or {}).get('compound', 0.0):.3f}", styles['Normal'])) |
|
except Exception: |
|
pass |
|
|
|
except Exception as e: |
|
logger.exception(f"Stories section creation failed: {e}") |
|
story.append(Paragraph("Story analysis data unavailable.", styles['Normal'])) |
|
return story |
|
|
|
|
|
def _create_keywords_section(keywords: List[Dict[str, Any]], styles) -> List[Any]: |
|
story: List[Any] = [] |
|
try: |
|
top = keywords[:15] |
|
if not top: |
|
story.append(Paragraph("No keywords extracted.", styles['Normal'])) |
|
return story |
|
|
|
data = [['Keyword', 'Score', 'Category']] |
|
for kw in top: |
|
score = kw.get('score', 0.0) |
|
relevance = kw.get('relevance', 'medium') |
|
data.append([kw.get('keyword', 'N/A'), f"{score:.3f}", str(relevance).title()]) |
|
|
|
table = Table(data) |
|
table.setStyle(TableStyle([ |
|
('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#2E86AB')), |
|
('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke), |
|
('ALIGN', (0, 0), (-1, -1), 'LEFT'), |
|
('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'), |
|
('FONTSIZE', (0, 0), (-1, 0), 10), |
|
('BOTTOMPADDING', (0, 0), (-1, 0), 10), |
|
('BACKGROUND', (0, 1), (-1, -1), colors.beige), |
|
('GRID', (0, 0), (-1, -1), 1, colors.black), |
|
])) |
|
story.append(table) |
|
story.append(Spacer(1, 0.15 * inch)) |
|
|
|
expl = ("Keywords were extracted using the YAKE algorithm, which identifies relevant terms and phrases " |
|
"based on statistical features of the text corpus.") |
|
story.append(Paragraph(expl, styles['Normal'])) |
|
except Exception as e: |
|
logger.exception(f"Keywords section creation failed: {e}") |
|
story.append(Paragraph("Keyword analysis data unavailable.", styles['Normal'])) |
|
return story |
|
|
|
|
|
def _create_sources_section(results: Dict[str, Any], styles) -> List[Any]: |
|
story: List[Any] = [] |
|
try: |
|
articles = results.get('articles', []) or [] |
|
if not articles: |
|
story.append(Paragraph("No source data available.", styles['Normal'])) |
|
return story |
|
|
|
|
|
counts: Dict[str, int] = {} |
|
for a in articles: |
|
src = a.get('source', 'Unknown') |
|
counts[src] = counts.get(src, 0) + 1 |
|
|
|
total = len(articles) |
|
data = [['News Source', 'Article Count', 'Percentage']] |
|
for src, ct in sorted(counts.items(), key=lambda x: x[1], reverse=True): |
|
data.append([src, str(ct), f"{_safe_div(ct, total) * 100:.1f}%"]) |
|
|
|
table = Table(data) |
|
table.setStyle(TableStyle([ |
|
('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#2E86AB')), |
|
('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke), |
|
('ALIGN', (0, 0), (-1, -1), 'LEFT'), |
|
('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'), |
|
('FONTSIZE', (0, 0), (-1, 0), 10), |
|
('BOTTOMPADDING', (0, 0), (-1, 0), 10), |
|
('BACKGROUND', (0, 1), (-1, -1), colors.beige), |
|
('GRID', (0, 0), (-1, -1), 1, colors.black), |
|
])) |
|
story.append(table) |
|
story.append(Spacer(1, 0.15 * inch)) |
|
|
|
expl = (f"Articles were collected from {len(counts)} different sources, providing diverse perspectives. " |
|
"Source diversity helps ensure comprehensive coverage and reduces bias.") |
|
story.append(Paragraph(expl, styles['Normal'])) |
|
except Exception as e: |
|
logger.exception(f"Sources section creation failed: {e}") |
|
story.append(Paragraph("Source analysis data unavailable.", styles['Normal'])) |
|
return story |
|
|
|
|
|
def _create_methodology_section(results: Dict[str, Any], total: int, proc_time: float) -> str: |
|
meth = ( |
|
"This analysis employed a comprehensive NLP pipeline:\n\n" |
|
"1. <b>Data Collection:</b> Articles were gathered from multiple RSS/business feeds. " |
|
"Content was filtered for relevance and deduplicated.\n\n" |
|
"2. <b>Sentiment Analysis:</b> VADER (general), Loughran–McDonald (finance), and FinBERT (finance) were combined. " |
|
"Final scores reflect a weighted composite.\n\n" |
|
"3. <b>Summarization & Keywords:</b> Articles were cleaned and summarized (transformer models when available), " |
|
"and key themes extracted with YAKE.\n\n" |
|
"4. <b>Quality Controls:</b> English-only filtering, minimum length checks, and relevance filters.\n\n" |
|
) |
|
try: |
|
meth += f"Processed {total} articles in {proc_time:.2f} seconds." |
|
except Exception: |
|
pass |
|
return meth |
|
|
|
|
|
|
|
|
|
|
|
def _generate_simple_pdf_fallback(results: Dict[str, Any]) -> io.BytesIO: |
|
total, dist_counts, avg = _norm_dist_from_results(results) |
|
query = results.get('query', 'N/A') |
|
|
|
pdf = FPDF() |
|
pdf.add_page() |
|
pdf.set_font('Arial', 'B', 16) |
|
pdf.cell(0, 10, 'News Analysis Report', ln=True) |
|
pdf.ln(5) |
|
|
|
pdf.set_font('Arial', '', 12) |
|
pdf.cell(0, 8, f"Query: {query}", ln=True) |
|
pdf.cell(0, 8, f"Articles: {total}", ln=True) |
|
pdf.cell(0, 8, f"Average Sentiment: {avg:.3f}", ln=True) |
|
pdf.ln(5) |
|
|
|
pos, neg, neu = dist_counts.get("Positive", 0), dist_counts.get("Negative", 0), dist_counts.get("Neutral", 0) |
|
pdf.cell(0, 8, "Sentiment Distribution:", ln=True) |
|
pdf.cell(0, 8, f" Positive: {pos} ({_safe_div(pos, total)*100:.1f}%)", ln=True) |
|
pdf.cell(0, 8, f" Negative: {neg} ({_safe_div(neg, total)*100:.1f}%)", ln=True) |
|
pdf.cell(0, 8, f" Neutral: {neu} ({_safe_div(neu, total)*100:.1f}%)", ln=True) |
|
|
|
buf = io.BytesIO() |
|
    # PyFPDF 1.x returns output(dest='S') as a latin-1 str; fpdf2 returns a bytearray.
    raw = pdf.output(dest='S')
    pdf_bytes = raw.encode('latin-1', 'replace') if isinstance(raw, str) else bytes(raw)
|
buf.write(pdf_bytes) |
|
buf.seek(0) |
|
return buf |
|
|
|
|
|
|
|
|
|
|
|
def create_chart_image(data: Dict, chart_type: str = 'pie') -> Optional[str]: |
|
if not MATPLOTLIB_AVAILABLE: |
|
return None |
|
try: |
|
plt.figure(figsize=(6, 4)) |
|
if chart_type == 'pie': |
|
|
|
total, dist_counts, _ = _norm_dist_from_results(data if 'articles' in data else {'summary': {'distribution': data}}) |
|
labels = ['Positive', 'Negative', 'Neutral'] |
|
sizes = [ |
|
dist_counts.get('Positive', 0), |
|
dist_counts.get('Negative', 0), |
|
dist_counts.get('Neutral', 0), |
|
] |
|
plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90) |
|
plt.title('Sentiment Distribution') |
|
elif chart_type == 'bar' and 'articles' in data: |
|
sources: Dict[str, int] = {} |
|
for a in data.get('articles', []): |
|
s = a.get('source', 'Unknown') |
|
sources[s] = sources.get(s, 0) + 1 |
|
top = dict(sorted(sources.items(), key=lambda x: x[1], reverse=True)[:10]) |
|
plt.bar(range(len(top)), list(top.values())) |
|
plt.xticks(range(len(top)), list(top.keys()), rotation=45, ha='right') |
|
plt.title('Articles by Source') |
|
plt.ylabel('Count') |
|
plt.tight_layout() |
|
|
|
buf = io.BytesIO() |
|
plt.savefig(buf, format='png', dpi=150, bbox_inches='tight') |
|
buf.seek(0) |
|
img64 = base64.b64encode(buf.getvalue()).decode() |
|
plt.close() |
|
return img64 |
|
    except Exception as e:
        logger.exception(f"Chart creation failed: {e}")
        plt.close('all')  # avoid leaking the figure created above
        return None
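
# Usage sketch: the return value is a base64-encoded PNG, so it can be embedded
# directly in HTML (variable names here are illustrative):
#
#     img64 = create_chart_image(results, chart_type='pie')
#     if img64:
#         html = f'<img src="data:image/png;base64,{img64}" alt="Sentiment chart"/>'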
|
|
|
|
|
|
|
|
|
|
|
def generate_csv_report(results: Dict[str, Any]) -> str: |
|
try: |
|
import csv |
|
import io as _io |
|
out = _io.StringIO() |
|
w = csv.writer(out) |
|
w.writerow(['Title', 'Source', 'URL', 'Date', 'Sentiment_Score', 'Sentiment_Label', |
|
'VADER_Score', 'LM_Score', 'FinBERT_Score', 'Summary']) |
|
for a in results.get('articles', []): |
|
s = a.get('sentiment', {}) or {} |
|
compound = s.get('compound', 0.0) |
|
if compound > 0.1: |
|
label = 'Positive' |
|
elif compound < -0.1: |
|
label = 'Negative' |
|
else: |
|
label = 'Neutral' |
|
w.writerow([ |
|
a.get('title', ''), |
|
a.get('source', ''), |
|
a.get('url', ''), |
|
a.get('date', ''), |
|
compound, |
|
label, |
|
s.get('vader', ''), |
|
s.get('loughran_mcdonald', ''), |
|
s.get('finbert', ''), |
|
(a.get('summary', '')[:200] + '...') if len(a.get('summary', '') or '') > 200 else a.get('summary', '') |
|
]) |
|
return out.getvalue() |
|
except Exception as e: |
|
logger.exception(f"CSV generation failed: {e}") |
|
return "Error generating CSV report" |
|
|
|
|
|
def generate_json_report(results: Dict[str, Any]) -> str: |
|
try: |
|
import json |
|
meta = { |
|
'report_generated': datetime.now().isoformat(), |
|
'query': results.get('query', ''), |
|
'languages': results.get('languages', ['English']), |
|
} |
|
total, dist_counts, avg = _norm_dist_from_results(results) |
|
summary = { |
|
'total_articles': total, |
|
'average_sentiment': avg, |
|
'sentiment_distribution': dist_counts, |
|
'top_sources': _get_top_sources(results), |
|
} |
|
report = { |
|
'metadata': meta, |
|
'summary': summary, |
|
'articles': results.get('articles', []), |
|
'keywords': (results.get('keywords', []) or [])[:20], |
|
'analysis_methods': { |
|
'sentiment_models': ['VADER', 'Loughran-McDonald', 'FinBERT'], |
|
'summarization_model': 'BART/DistilBART/T5 (when available)', |
|
'keyword_extraction': 'YAKE', |
|
'translation_models': ['Helsinki-NLP Opus-MT'] |
|
} |
|
} |
|
return json.dumps(report, indent=2, default=str, ensure_ascii=False) |
|
except Exception as e: |
|
logger.exception(f"JSON generation failed: {e}") |
|
try: |
|
import json |
|
return json.dumps({'error': str(e)}, indent=2) |
|
except Exception: |
|
return '{"error":"JSON generation failed"}' |
|
|
|
|
|
def _get_top_sources(results: Dict[str, Any]) -> List[Dict[str, Any]]: |
|
try: |
|
arts = results.get('articles', []) or [] |
|
total = len(arts) |
|
counts: Dict[str, int] = {} |
|
for a in arts: |
|
src = a.get('source', 'Unknown') |
|
counts[src] = counts.get(src, 0) + 1 |
|
items = [ |
|
{'source': s, 'count': c, 'percentage': round(_safe_div(c, total) * 100.0, 1)} |
|
for s, c in counts.items() |
|
] |
|
return sorted(items, key=lambda x: x['count'], reverse=True)[:10] |
|
except Exception as e: |
|
logger.exception(f"Top sources calculation failed: {e}") |
|
return [] |
|
|
|
|
|
def validate_report_data(results: Dict[str, Any]) -> bool: |
|
""" |
|
Validate that results contain required data for reporting. |
|
We’re lenient now: require 'articles' and 'query'. |
|
""" |
|
if 'query' not in results or 'articles' not in results: |
|
logger.error("Missing required keys: 'query' and/or 'articles'") |
|
return False |
|
if not isinstance(results['articles'], list) or len(results['articles']) == 0: |
|
logger.error("No articles available for reporting") |
|
return False |
|
return True |
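
# Usage sketch: guard report generation on validation (variable names illustrative):
#
#     if validate_report_data(results):
#         pdf_buffer = generate_pdf_report(results)
#     else:
#         logger.warning("Results missing 'query'/'articles'; skipping report")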
|
|
|
|
|
__all__ = [ |
|
'generate_pdf_report', |
|
'generate_csv_report', |
|
'generate_json_report', |
|
'create_chart_image', |
|
'validate_report_data', |
|
] |
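

# Minimal self-test sketch. The data below is hypothetical; field names mirror the
# structures this module reads, but real analyzer output may include more fields.
if __name__ == "__main__":
    demo_results = {
        "query": "ExampleCorp",
        "articles": [
            {
                "title": "ExampleCorp posts record earnings",
                "source": "Example Wire",
                "url": "https://example.com/a1",
                "date": "2024-01-02",
                "sentiment": {"compound": 0.62},
                "summary": "Strong quarterly results.",
            },
            {
                "title": "ExampleCorp faces supply delays",
                "source": "Example Times",
                "url": "https://example.com/a2",
                "date": "2024-01-03",
                "sentiment": {"compound": -0.35},
                "summary": "Logistics issues weigh on outlook.",
            },
        ],
    }
    if validate_report_data(demo_results):
        print(generate_csv_report(demo_results))
        print(generate_json_report(demo_results)[:500])
        pdf_buf = generate_pdf_report(demo_results)
        with open("demo_report.pdf", "wb") as fh:  # output path is illustrative
            fh.write(pdf_buf.getvalue())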
|
|