Upload 13 files
- api_backend (1).py +355 -0
- config_json.json +51 -0
- dockerfile.txt +46 -0
- gitattributes_file.txt +33 -0
- nlp_module (1).py +464 -0
- report_module (1).py +606 -0
- requirements_file.txt +47 -0
- scraper_module.py +396 -0
- streamlit_app.py +562 -0
- summarizer_module.py +400 -0
- translator_module (1).py +336 -0
- tts_module.py +336 -0
- utils_module (1).py +442 -0
api_backend (1).py
ADDED
@@ -0,0 +1,355 @@
from fastapi import FastAPI, HTTPException, Query
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from typing import List, Optional, Dict, Any
import asyncio
import logging
from datetime import datetime
import json

# Import our modules
from scraper import NewsletterScraper
from nlp import SentimentAnalyzer, KeywordExtractor
from summarizer import TextSummarizer
from translator import MultilingualTranslator
from tts import AudioGenerator
from utils import setup_logging, cache_results

# Setup logging
setup_logging()
logger = logging.getLogger(__name__)

# FastAPI app
app = FastAPI(
    title="Global Business News Intelligence API",
    description="Advanced news analysis with sentiment, summarization, and multilingual support",
    version="1.0.0"
)

# CORS middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

class AnalysisRequest(BaseModel):
    query: str
    num_articles: int = 20
    languages: List[str] = ["English"]
    include_audio: bool = True
    sentiment_models: List[str] = ["VADER", "Loughran-McDonald", "FinBERT"]

class AnalysisResponse(BaseModel):
    query: str
    total_articles: int
    processing_time: float
    average_sentiment: float
    sentiment_distribution: Dict[str, int]
    articles: List[Dict[str, Any]]
    keywords: List[Dict[str, Any]]
    summary: Dict[str, Any]
    languages: List[str]
    audio_files: Optional[Dict[str, str]] = None

class NewsAnalyzer:
    """Main news analysis orchestrator"""

    def __init__(self):
        self.scraper = NewsletterScraper()
        self.sentiment_analyzer = SentimentAnalyzer()
        self.keyword_extractor = KeywordExtractor()
        self.summarizer = TextSummarizer()
        self.translator = MultilingualTranslator()
        self.audio_generator = AudioGenerator()

        logger.info("NewsAnalyzer initialized successfully")

    async def analyze_news_async(self, config: Dict[str, Any], progress_callback=None) -> Dict[str, Any]:
        """Async version of analyze_news"""
        return self.analyze_news(config, progress_callback)

    def analyze_news(self, config: Dict[str, Any], progress_callback=None) -> Dict[str, Any]:
        """Main analysis pipeline"""
        start_time = datetime.now()

        try:
            query = config['query']
            num_articles = config.get('num_articles', 20)
            languages = config.get('languages', ['English'])
            include_audio = config.get('include_audio', True)
            sentiment_models = config.get('sentiment_models', ['VADER', 'Loughran-McDonald', 'FinBERT'])

            logger.info(f"Starting analysis for query: {query}")

            if progress_callback:
                progress_callback(10, "Scraping articles...")

            # Step 1: Scrape articles
            articles = self.scraper.scrape_news(query, num_articles)
            logger.info(f"Scraped {len(articles)} articles")

            if not articles:
                raise ValueError("No articles found for the given query")

            if progress_callback:
                progress_callback(30, "Analyzing sentiment...")

            # Step 2: Sentiment analysis
            for article in articles:
                article['sentiment'] = self.sentiment_analyzer.analyze_sentiment(
                    article['content'],
                    models=sentiment_models
                )

            if progress_callback:
                progress_callback(50, "Extracting keywords...")

            # Step 3: Keyword extraction
            all_text = ' '.join([article['content'] for article in articles])
            keywords = self.keyword_extractor.extract_keywords(all_text)

            if progress_callback:
                progress_callback(60, "Generating summaries...")

            # Step 4: Summarization
            for article in articles:
                article['summary'] = self.summarizer.summarize(article['content'])

                # Multilingual summaries
                if len(languages) > 1:
                    article['summaries'] = {}
                    for lang in languages:
                        if lang != 'English':
                            article['summaries'][lang] = self.translator.translate(
                                article['summary'],
                                target_lang=lang
                            )
                        else:
                            article['summaries'][lang] = article['summary']

            if progress_callback:
                progress_callback(80, "Generating audio...")

            # Step 5: Audio generation
            audio_files = {}
            if include_audio and languages:
                # Create overall summary for audio
                overall_summary = self.create_overall_summary(articles, keywords)

                for lang in languages:
                    if lang in ['English', 'Hindi', 'Tamil']:
                        try:
                            if lang != 'English':
                                summary_text = self.translator.translate(overall_summary, target_lang=lang)
                            else:
                                summary_text = overall_summary

                            audio_file = self.audio_generator.generate_audio(
                                summary_text,
                                language=lang,
                                output_file=f"summary_{lang.lower()}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp3"
                            )
                            audio_files[lang] = audio_file
                        except Exception as e:
                            logger.error(f"Error generating audio for {lang}: {str(e)}")

            if progress_callback:
                progress_callback(90, "Finalizing results...")

            # Step 6: Calculate summary statistics
            sentiments = [article['sentiment']['compound'] for article in articles]
            average_sentiment = sum(sentiments) / len(sentiments) if sentiments else 0.0

            sentiment_distribution = {
                'Positive': sum(1 for s in sentiments if s > 0.1),
                'Negative': sum(1 for s in sentiments if s < -0.1),
                'Neutral': sum(1 for s in sentiments if -0.1 <= s <= 0.1)
            }

            # Step 7: Prepare results
            processing_time = (datetime.now() - start_time).total_seconds()

            results = {
                'query': query,
                'total_articles': len(articles),
                'processing_time': processing_time,
                'average_sentiment': average_sentiment,
                'sentiment_distribution': sentiment_distribution,
                'articles': articles,
                'keywords': keywords,
                'languages': languages,
                'audio_files': audio_files,
                'summary': {
                    'average_sentiment': average_sentiment,
                    'total_articles': len(articles),
                    'sources': len(set([article['source'] for article in articles])),
                    'date_range': self.get_date_range(articles)
                }
            }

            if progress_callback:
                progress_callback(100, "Analysis complete!")

            logger.info(f"Analysis completed successfully in {processing_time:.2f} seconds")
            return results

        except Exception as e:
            logger.error(f"Error in analysis pipeline: {str(e)}")
            raise e

    def create_overall_summary(self, articles: List[Dict], keywords: List[Dict]) -> str:
        """Create an overall summary for audio generation"""
        try:
            # Get top keywords
            top_keywords = [kw['keyword'] for kw in keywords[:10]]

            # Calculate sentiment distribution
            positive_count = sum(1 for article in articles if article['sentiment']['compound'] > 0.1)
            negative_count = sum(1 for article in articles if article['sentiment']['compound'] < -0.1)
            neutral_count = len(articles) - positive_count - negative_count

            # Create summary text
            summary = f"Analysis of {len(articles)} articles reveals "

            if positive_count > negative_count:
                summary += f"predominantly positive sentiment with {positive_count} positive, {negative_count} negative, and {neutral_count} neutral articles. "
            elif negative_count > positive_count:
                summary += f"predominantly negative sentiment with {negative_count} negative, {positive_count} positive, and {neutral_count} neutral articles. "
            else:
                summary += "mixed sentiment with balanced coverage. "

            if top_keywords:
                summary += f"Key topics include: {', '.join(top_keywords[:5])}. "

            # Add top stories
            top_positive = sorted(articles, key=lambda x: x['sentiment']['compound'], reverse=True)[:2]
            top_negative = sorted(articles, key=lambda x: x['sentiment']['compound'])[:2]

            if top_positive[0]['sentiment']['compound'] > 0.1:
                summary += f"Most positive coverage: {top_positive[0]['title'][:100]}. "

            if top_negative[0]['sentiment']['compound'] < -0.1:
                summary += f"Most concerning coverage: {top_negative[0]['title'][:100]}. "

            return summary

        except Exception as e:
            logger.error(f"Error creating overall summary: {str(e)}")
            return f"Analysis of {len(articles)} articles completed successfully."

    def get_date_range(self, articles: List[Dict]) -> Dict[str, str]:
        """Get the date range of articles"""
        try:
            dates = [article['date'] for article in articles if 'date' in article and article['date']]
            if dates:
                dates = [d for d in dates if d is not None]
                if dates:
                    min_date = min(dates)
                    max_date = max(dates)
                    return {
                        'start': str(min_date),
                        'end': str(max_date)
                    }
            return {'start': 'Unknown', 'end': 'Unknown'}
        except Exception as e:
            logger.error(f"Error getting date range: {str(e)}")
            return {'start': 'Unknown', 'end': 'Unknown'}

# Initialize the analyzer
analyzer = NewsAnalyzer()

# FastAPI endpoints
@app.get("/", response_model=Dict[str, str])
async def root():
    """API root endpoint"""
    return {
        "message": "Global Business News Intelligence API",
        "version": "1.0.0",
        "docs": "/docs"
    }

@app.get("/health", response_model=Dict[str, str])
async def health_check():
    """Health check endpoint"""
    return {"status": "healthy", "timestamp": datetime.now().isoformat()}

@app.get("/api/analyze", response_model=AnalysisResponse)
async def analyze_news_endpoint(
    query: str = Query(..., description="Company name, ticker, or keyword to analyze"),
    num_articles: int = Query(20, description="Number of articles to analyze (5-50)", ge=5, le=50),
    languages: List[str] = Query(["English"], description="Languages for summaries"),
    include_audio: bool = Query(True, description="Generate audio summaries"),
    sentiment_models: List[str] = Query(["VADER", "Loughran-McDonald", "FinBERT"], description="Sentiment models to use")
):
    """Main analysis endpoint"""
    try:
        config = {
            'query': query,
            'num_articles': num_articles,
            'languages': languages,
            'include_audio': include_audio,
            'sentiment_models': sentiment_models
        }

        results = await analyzer.analyze_news_async(config)

        return AnalysisResponse(**results)

    except Exception as e:
        logger.error(f"Error in analyze endpoint: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))

@app.post("/api/analyze", response_model=AnalysisResponse)
async def analyze_news_post(request: AnalysisRequest):
    """POST version of analysis endpoint"""
    try:
        config = request.dict()
        results = await analyzer.analyze_news_async(config)
        return AnalysisResponse(**results)

    except Exception as e:
        logger.error(f"Error in analyze POST endpoint: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))

@app.get("/api/sources", response_model=List[str])
async def get_available_sources():
    """Get list of available news sources"""
    return analyzer.scraper.get_available_sources()

@app.get("/api/models", response_model=Dict[str, List[str]])
async def get_available_models():
    """Get list of available models"""
    return {
        "sentiment_models": ["VADER", "Loughran-McDonald", "FinBERT"],
        "summarization_models": ["distilbart-cnn-12-6"],
        "translation_models": ["Helsinki-NLP/opus-mt-en-hi", "Helsinki-NLP/opus-mt-en-fi"],
        "audio_languages": ["English", "Hindi", "Tamil"]
    }

@app.get("/api/keywords/{query}", response_model=List[Dict[str, Any]])
async def extract_keywords_endpoint(
    query: str,
    num_keywords: int = Query(20, description="Number of keywords to extract", ge=5, le=50)
):
    """Extract keywords from a query or text"""
    try:
        # For demo purposes, we'll scrape a few articles and extract keywords
        articles = analyzer.scraper.scrape_news(query, 5)
        if not articles:
            raise HTTPException(status_code=404, detail="No articles found for query")

        all_text = ' '.join([article['content'] for article in articles])
        keywords = analyzer.keyword_extractor.extract_keywords(all_text, num_keywords=num_keywords)

        return keywords

    except Exception as e:
        logger.error(f"Error in keywords endpoint: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
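
For reference, a minimal client sketch of calling the POST /api/analyze endpoint above. The base URL and the example payload values are assumptions; only the field names come from the AnalysisRequest model.

import requests

# Hypothetical example values; only the field names are taken from AnalysisRequest.
payload = {
    "query": "Tesla",
    "num_articles": 10,
    "languages": ["English", "Hindi"],
    "include_audio": False,
    "sentiment_models": ["VADER", "FinBERT"],
}

resp = requests.post("http://localhost:8000/api/analyze", json=payload, timeout=300)
resp.raise_for_status()
data = resp.json()
print(data["total_articles"], data["average_sentiment"])
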
config_json.json
ADDED
@@ -0,0 +1,51 @@
{
  "max_articles": 50,
  "cache_ttl_hours": 6,
  "supported_languages": ["English", "Hindi", "Tamil"],
  "sentiment_models": ["VADER", "Loughran-McDonald", "FinBERT"],
  "summarization_max_length": 150,
  "summarization_min_length": 50,
  "audio_enabled": true,
  "translation_enabled": true,
  "keyword_extraction_enabled": true,
  "max_keywords": 20,
  "debug_mode": false,
  "huggingface_space_config": {
    "title": "Global Business News Intelligence Dashboard",
    "emoji": "📊",
    "colorFrom": "blue",
    "colorTo": "green",
    "sdk": "streamlit",
    "sdk_version": "1.28.1",
    "app_file": "app.py",
    "pinned": false,
    "license": "mit"
  },
  "api_config": {
    "host": "0.0.0.0",
    "port": 8000,
    "reload": false,
    "workers": 1
  },
  "performance_settings": {
    "max_concurrent_requests": 10,
    "request_timeout_seconds": 300,
    "memory_limit_gb": 4,
    "cpu_optimization": true
  },
  "news_sources": {
    "google_news": true,
    "reuters": true,
    "bbc": true,
    "cnbc": true,
    "bloomberg": true,
    "marketwatch": true,
    "financial_times": false
  },
  "model_settings": {
    "use_cpu_only": true,
    "model_cache_dir": "./model_cache",
    "download_models_on_startup": false,
    "optimize_for_inference": true
  }
}
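
Nothing in this upload shows which module consumes config_json.json, so as an illustrative assumption, loading it with the standard library would look like:

import json

# Load runtime settings from the config file (file name/path is an assumption).
with open("config_json.json", encoding="utf-8") as f:
    config = json.load(f)

print(config["max_articles"])        # 50
print(config["api_config"]["port"])  # 8000
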
dockerfile.txt
ADDED
@@ -0,0 +1,46 @@
# Use official Python runtime as base image
FROM python:3.9-slim

# Set working directory
WORKDIR /app

# Set environment variables
ENV PYTHONUNBUFFERED=1
ENV PYTHONDONTWRITEBYTECODE=1
ENV STREAMLIT_SERVER_HEADLESS=true
ENV STREAMLIT_SERVER_PORT=7860
ENV STREAMLIT_SERVER_ADDRESS=0.0.0.0
ENV STREAMLIT_BROWSER_GATHER_USAGE_STATS=false

# Install system dependencies
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    software-properties-common \
    git \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first for better caching
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r requirements.txt

# Download NLTK data
RUN python -c "import nltk; nltk.download('vader_lexicon'); nltk.download('punkt'); nltk.download('stopwords')"

# Copy application code
COPY . .

# Create necessary directories
RUN mkdir -p logs cache model_cache temp

# Expose port
EXPOSE 7860

# Health check
HEALTHCHECK CMD curl --fail http://localhost:7860/_stcore/health

# Run application
CMD ["streamlit", "run", "app.py", "--server.port=7860", "--server.address=0.0.0.0"]
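
Note that docker build looks for a file named Dockerfile by default, so this file would need to be renamed (or passed explicitly with -f dockerfile.txt). A local build would then be: docker build -t news-dashboard . followed by docker run -p 7860:7860 news-dashboard, where news-dashboard is a placeholder image tag.
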
gitattributes_file.txt
ADDED
@@ -0,0 +1,33 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
nlp_module (1).py
ADDED
@@ -0,0 +1,464 @@
import re
import string
import logging
from typing import Dict, List, Any, Optional
import pandas as pd
import numpy as np
from collections import Counter

# NLTK imports
import nltk
try:
    from nltk.sentiment import SentimentIntensityAnalyzer
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize, sent_tokenize
    from nltk.stem import PorterStemmer
except ImportError:
    pass

# Download required NLTK data
try:
    nltk.download('vader_lexicon', quiet=True)
    nltk.download('punkt', quiet=True)
    nltk.download('stopwords', quiet=True)
except:
    pass

# Transformers for FinBERT
try:
    from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
    import torch
except ImportError:
    pass

# YAKE for keyword extraction
try:
    import yake
except ImportError:
    pass

logger = logging.getLogger(__name__)

class SentimentAnalyzer:
    """Multi-model sentiment analysis"""

    def __init__(self):
        self.vader_analyzer = None
        self.finbert_pipeline = None
        self.loughran_mcdonald_dict = None

        self._initialize_models()
        logger.info("SentimentAnalyzer initialized")

    def _initialize_models(self):
        """Initialize all sentiment analysis models"""
        # VADER
        try:
            self.vader_analyzer = SentimentIntensityAnalyzer()
            logger.info("VADER model loaded")
        except Exception as e:
            logger.error(f"Failed to load VADER: {str(e)}")

        # FinBERT
        try:
            model_name = "ProsusAI/finbert"
            self.finbert_pipeline = pipeline(
                "sentiment-analysis",
                model=model_name,
                tokenizer=model_name,
                device=0 if torch.cuda.is_available() else -1
            )
            logger.info("FinBERT model loaded")
        except Exception as e:
            logger.warning(f"Failed to load FinBERT, using CPU fallback: {str(e)}")
            try:
                model_name = "ProsusAI/finbert"
                self.finbert_pipeline = pipeline(
                    "sentiment-analysis",
                    model=model_name,
                    tokenizer=model_name,
                    device=-1
                )
                logger.info("FinBERT model loaded on CPU")
            except Exception as e2:
                logger.error(f"Failed to load FinBERT completely: {str(e2)}")

        # Loughran-McDonald Dictionary
        try:
            self.loughran_mcdonald_dict = self._load_loughran_mcdonald()
            logger.info("Loughran-McDonald dictionary loaded")
        except Exception as e:
            logger.error(f"Failed to load Loughran-McDonald dictionary: {str(e)}")

    def _load_loughran_mcdonald(self) -> Dict[str, List[str]]:
        """Load Loughran-McDonald financial sentiment dictionary"""
        # Simplified version with key financial sentiment words
        return {
            'positive': [
                'profit', 'profitable', 'profitability', 'revenue', 'revenues', 'growth',
                'growing', 'increase', 'increased', 'increasing', 'success', 'successful',
                'gain', 'gains', 'benefit', 'benefits', 'improvement', 'improved', 'strong',
                'stronger', 'excellent', 'outstanding', 'exceed', 'exceeded', 'exceeds',
                'beat', 'beats', 'positive', 'optimistic', 'bullish', 'rise', 'rising',
                'surge', 'surged', 'boom', 'booming', 'expand', 'expansion', 'opportunity',
                'opportunities', 'advance', 'advances', 'achievement', 'achieve', 'winner'
            ],
            'negative': [
                'loss', 'losses', 'lose', 'losing', 'decline', 'declining', 'decrease',
                'decreased', 'decreasing', 'fall', 'falling', 'drop', 'dropped', 'plunge',
                'plunged', 'crash', 'crashed', 'failure', 'failed', 'weak', 'weakness',
                'poor', 'worse', 'worst', 'bad', 'terrible', 'crisis', 'problem', 'problems',
                'risk', 'risks', 'risky', 'concern', 'concerns', 'worried', 'worry',
                'negative', 'pessimistic', 'bearish', 'bankruptcy', 'bankrupt', 'deficit',
                'debt', 'lawsuit', 'sue', 'sued', 'investigation', 'fraud', 'scandal',
                'volatility', 'volatile', 'uncertainty', 'uncertain', 'challenge', 'challenges'
            ]
        }

    def analyze_sentiment(self, text: str, models: List[str] = None) -> Dict[str, Any]:
        """Analyze sentiment using multiple models"""
        if models is None:
            models = ['VADER', 'Loughran-McDonald', 'FinBERT']

        results = {}

        # Clean text
        cleaned_text = self._clean_text(text)

        # VADER Analysis
        if 'VADER' in models and self.vader_analyzer:
            try:
                vader_scores = self.vader_analyzer.polarity_scores(cleaned_text)
                results['vader'] = vader_scores['compound']
                results['vader_detailed'] = vader_scores
            except Exception as e:
                logger.error(f"VADER analysis failed: {str(e)}")
                results['vader'] = 0.0

        # Loughran-McDonald Analysis
        if 'Loughran-McDonald' in models and self.loughran_mcdonald_dict:
            try:
                lm_score = self._analyze_loughran_mcdonald(cleaned_text)
                results['loughran_mcdonald'] = lm_score
            except Exception as e:
                logger.error(f"Loughran-McDonald analysis failed: {str(e)}")
                results['loughran_mcdonald'] = 0.0

        # FinBERT Analysis
        if 'FinBERT' in models and self.finbert_pipeline:
            try:
                # Truncate text for FinBERT (max 512 tokens)
                truncated_text = cleaned_text[:2000]  # Approximate token limit
                finbert_result = self.finbert_pipeline(truncated_text)[0]

                # Convert to numerical score
                label = finbert_result['label'].lower()
                confidence = finbert_result['score']

                if label == 'positive':
                    finbert_score = confidence
                elif label == 'negative':
                    finbert_score = -confidence
                else:  # neutral
                    finbert_score = 0.0

                results['finbert'] = finbert_score
                results['finbert_detailed'] = finbert_result

            except Exception as e:
                logger.error(f"FinBERT analysis failed: {str(e)}")
                results['finbert'] = 0.0

        # Calculate composite score
        scores = []
        weights = {'vader': 0.3, 'loughran_mcdonald': 0.4, 'finbert': 0.3}

        for model in ['vader', 'loughran_mcdonald', 'finbert']:
            if model in results:
                scores.append(results[model] * weights[model])

        results['compound'] = sum(scores) if scores else 0.0

        return results

    def _clean_text(self, text: str) -> str:
        """Clean text for sentiment analysis"""
        if not text:
            return ""

        # Remove URLs
        text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)

        # Remove email addresses
        text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '', text)

        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text)

        # Remove special characters but keep basic punctuation
        text = re.sub(r'[^\w\s.,!?;:\-\'"()]', '', text)

        return text.strip()

    def _analyze_loughran_mcdonald(self, text: str) -> float:
        """Analyze sentiment using Loughran-McDonald dictionary"""
        try:
            words = word_tokenize(text.lower())

            positive_count = sum(1 for word in words if word in self.loughran_mcdonald_dict['positive'])
            negative_count = sum(1 for word in words if word in self.loughran_mcdonald_dict['negative'])

            total_sentiment_words = positive_count + negative_count

            if total_sentiment_words == 0:
                return 0.0

            # Calculate normalized score
            score = (positive_count - negative_count) / len(words) * 10  # Scale factor

            # Clamp to [-1, 1] range
            return max(-1.0, min(1.0, score))

        except Exception as e:
            logger.error(f"Loughran-McDonald calculation error: {str(e)}")
            return 0.0

class KeywordExtractor:
    """Extract important keywords from text using YAKE"""

    def __init__(self):
        self.stop_words = set()
        try:
            self.stop_words = set(stopwords.words('english'))
        except:
            # Fallback stop words
            self.stop_words = {
                'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
                'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'have',
                'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should',
                'may', 'might', 'must', 'can', 'this', 'that', 'these', 'those'
            }

        logger.info("KeywordExtractor initialized")

    def extract_keywords(self, text: str, num_keywords: int = 20) -> List[Dict[str, Any]]:
        """Extract keywords using YAKE algorithm"""
        try:
            # Use YAKE if available
            if 'yake' in globals():
                return self._extract_with_yake(text, num_keywords)
            else:
                return self._extract_with_frequency(text, num_keywords)

        except Exception as e:
            logger.error(f"Keyword extraction failed: {str(e)}")
            return []

    def _extract_with_yake(self, text: str, num_keywords: int) -> List[Dict[str, Any]]:
        """Extract keywords using YAKE algorithm"""
        try:
            # YAKE configuration
            kw_extractor = yake.KeywordExtractor(
                lan="en",
                n=3,  # n-gram size
                dedupLim=0.9,
                top=num_keywords,
                features=None
            )

            keywords = kw_extractor.extract_keywords(text)

            # Convert to desired format (lower score = more relevant in YAKE)
            result = []
            for keyword, score in keywords:
                result.append({
                    'keyword': keyword,
                    'score': 1.0 / (1.0 + score),  # Invert score so higher = more relevant
                    'relevance': 'high' if score < 0.1 else 'medium' if score < 0.3 else 'low'
                })

            return result

        except Exception as e:
            logger.error(f"YAKE extraction failed: {str(e)}")
            return self._extract_with_frequency(text, num_keywords)

    def _extract_with_frequency(self, text: str, num_keywords: int) -> List[Dict[str, Any]]:
        """Fallback keyword extraction using frequency analysis"""
        try:
            # Clean and tokenize
            words = word_tokenize(text.lower())

            # Filter words
            filtered_words = [
                word for word in words
                if (word not in self.stop_words and
                    word not in string.punctuation and
                    len(word) > 2 and
                    word.isalpha())
            ]

            # Count frequencies
            word_freq = Counter(filtered_words)

            # Get top keywords
            top_words = word_freq.most_common(num_keywords)

            # Calculate relevance scores
            max_freq = top_words[0][1] if top_words else 1

            result = []
            for word, freq in top_words:
                score = freq / max_freq
                result.append({
                    'keyword': word,
                    'score': score,
                    'relevance': 'high' if score > 0.7 else 'medium' if score > 0.3 else 'low'
                })

            return result

        except Exception as e:
            logger.error(f"Frequency extraction failed: {str(e)}")
            return []

class TextProcessor:
    """Text preprocessing and cleaning utilities"""

    def __init__(self):
        self.stemmer = PorterStemmer()
        logger.info("TextProcessor initialized")

    def clean_article_content(self, content: str) -> str:
        """Clean article content by removing boilerplate"""
        if not content:
            return ""

        # Remove common boilerplate patterns
        boilerplate_patterns = [
            r'Subscribe to our newsletter.*',
            r'Sign up for.*',
            r'Follow us on.*',
            r'Copyright.*',
            r'All rights reserved.*',
            r'Terms of use.*',
            r'Privacy policy.*',
            r'Cookie policy.*',
            r'\d+ comments?',
            r'Share this article.*',
            r'Related articles?.*',
            r'More from.*',
            r'Advertisement.*',
            r'Sponsored content.*'
        ]

        cleaned_content = content
        for pattern in boilerplate_patterns:
            cleaned_content = re.sub(pattern, '', cleaned_content, flags=re.IGNORECASE)

        # Remove extra whitespace
        cleaned_content = re.sub(r'\s+', ' ', cleaned_content)

        # Remove very short sentences (likely navigation/boilerplate)
        sentences = sent_tokenize(cleaned_content)
        meaningful_sentences = [
            sent for sent in sentences
            if len(sent.split()) > 5 and not self._is_boilerplate_sentence(sent)
        ]

        return ' '.join(meaningful_sentences).strip()

    def _is_boilerplate_sentence(self, sentence: str) -> bool:
        """Check if sentence is likely boilerplate"""
        boilerplate_indicators = [
            'click here', 'read more', 'subscribe', 'follow us', 'contact us',
            'terms of service', 'privacy policy', 'copyright', 'all rights reserved',
            'advertisement', 'sponsored', 'related articles'
        ]

        sentence_lower = sentence.lower()
        return any(indicator in sentence_lower for indicator in boilerplate_indicators)

    def extract_entities(self, text: str) -> Dict[str, List[str]]:
        """Extract named entities (companies, people, locations)"""
        # Simple regex-based entity extraction
        entities = {
            'companies': [],
            'people': [],
            'locations': [],
            'money': [],
            'dates': []
        }

        try:
            # Company patterns (simplified)
            company_pattern = r'\b[A-Z][a-zA-Z]+ (?:Inc|Corp|LLC|Ltd|Company|Co)\b'
            entities['companies'] = list(set(re.findall(company_pattern, text)))

            # Money patterns
            money_pattern = r'\$[\d,]+(?:\.\d{2})?(?:\s?(?:million|billion|trillion|k|M|B|T))?'
            entities['money'] = list(set(re.findall(money_pattern, text)))

            # Date patterns (simplified)
            date_pattern = r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}'
            entities['dates'] = list(set(re.findall(date_pattern, text)))

        except Exception as e:
            logger.error(f"Entity extraction failed: {str(e)}")

        return entities

    def calculate_readability(self, text: str) -> Dict[str, float]:
        """Calculate text readability metrics"""
        try:
            sentences = sent_tokenize(text)
            words = word_tokenize(text)

            if not sentences or not words:
                return {'flesch_score': 0.0, 'avg_sentence_length': 0.0, 'avg_word_length': 0.0}

            # Basic metrics
            num_sentences = len(sentences)
            num_words = len(words)
            num_syllables = sum(self._count_syllables(word) for word in words if word.isalpha())

            # Average sentence length
            avg_sentence_length = num_words / num_sentences

            # Average word length
            avg_word_length = sum(len(word) for word in words if word.isalpha()) / num_words

            # Flesch Reading Ease Score (simplified)
            flesch_score = 206.835 - (1.015 * avg_sentence_length) - (84.6 * (num_syllables / num_words))

            return {
                'flesch_score': max(0.0, min(100.0, flesch_score)),
                'avg_sentence_length': avg_sentence_length,
                'avg_word_length': avg_word_length
            }

        except Exception as e:
            logger.error(f"Readability calculation failed: {str(e)}")
            return {'flesch_score': 0.0, 'avg_sentence_length': 0.0, 'avg_word_length': 0.0}

    def _count_syllables(self, word: str) -> int:
        """Count syllables in a word (simplified)"""
        word = word.lower()
        vowels = 'aeiouy'
        syllable_count = 0
        prev_char_was_vowel = False

        for char in word:
            if char in vowels:
                if not prev_char_was_vowel:
                    syllable_count += 1
                prev_char_was_vowel = True
            else:
                prev_char_was_vowel = False

        # Handle silent e
        if word.endswith('e'):
            syllable_count -= 1

        # Every word has at least one syllable
        return max(1, syllable_count)
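
A minimal sketch of driving the classes above directly, assuming the module is saved as nlp.py to match the "from nlp import SentimentAnalyzer, KeywordExtractor" line in api_backend (1).py. The sample sentences are illustrative only, and the first run downloads the FinBERT model from the Hugging Face Hub.

from nlp import SentimentAnalyzer, KeywordExtractor

# Score a financial sentence with two of the three models.
analyzer = SentimentAnalyzer()
scores = analyzer.analyze_sentiment(
    "Quarterly revenue grew strongly, beating analyst expectations.",
    models=["VADER", "Loughran-McDonald"],
)
print(scores["compound"])  # weighted composite score in [-1, 1]

# Pull ranked keywords (YAKE if installed, frequency fallback otherwise).
extractor = KeywordExtractor()
for kw in extractor.extract_keywords("Central banks weigh rate cuts as inflation cools.", num_keywords=5):
    print(kw["keyword"], kw["relevance"])
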
report_module (1).py
ADDED
|
@@ -0,0 +1,606 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from typing import Dict, List, Any, Optional
|
| 3 |
+
import io
|
| 4 |
+
from datetime import datetime
|
| 5 |
+
import base64
|
| 6 |
+
|
| 7 |
+
# PDF generation
|
| 8 |
+
try:
|
| 9 |
+
from reportlab.lib.pagesizes import letter, A4
|
| 10 |
+
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, Image
|
| 11 |
+
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
|
| 12 |
+
from reportlab.lib.units import inch
|
| 13 |
+
from reportlab.lib import colors
|
| 14 |
+
from reportlab.graphics.shapes import Drawing
|
| 15 |
+
from reportlab.graphics.charts.piecharts import Pie
|
| 16 |
+
from reportlab.graphics.charts.barcharts import VerticalBarChart
|
| 17 |
+
REPORTLAB_AVAILABLE = True
|
| 18 |
+
except ImportError:
|
| 19 |
+
REPORTLAB_AVAILABLE = False
|
| 20 |
+
|
| 21 |
+
# Plotting for charts in PDF
|
| 22 |
+
try:
|
| 23 |
+
import matplotlib.pyplot as plt
|
| 24 |
+
import matplotlib
|
| 25 |
+
matplotlib.use('Agg') # Use non-interactive backend
|
| 26 |
+
MATPLOTLIB_AVAILABLE = True
|
| 27 |
+
except ImportError:
|
| 28 |
+
MATPLOTLIB_AVAILABLE = False
|
| 29 |
+
|
| 30 |
+
logger = logging.getLogger(__name__)
|
| 31 |
+
|
| 32 |
+
def generate_pdf_report(results: Dict[str, Any]) -> io.BytesIO:
|
| 33 |
+
"""Generate a comprehensive PDF report"""
|
| 34 |
+
if not REPORTLAB_AVAILABLE:
|
| 35 |
+
logger.error("ReportLab not available for PDF generation")
|
| 36 |
+
return _generate_simple_pdf_fallback(results)
|
| 37 |
+
|
| 38 |
+
try:
|
| 39 |
+
# Create PDF buffer
|
| 40 |
+
buffer = io.BytesIO()
|
| 41 |
+
|
| 42 |
+
# Create document
|
| 43 |
+
doc = SimpleDocTemplate(
|
| 44 |
+
buffer,
|
| 45 |
+
pagesize=A4,
|
| 46 |
+
rightMargin=72,
|
| 47 |
+
leftMargin=72,
|
| 48 |
+
topMargin=72,
|
| 49 |
+
bottomMargin=18
|
| 50 |
+
)
|
| 51 |
+
|
| 52 |
+
# Get styles
|
| 53 |
+
styles = getSampleStyleSheet()
|
| 54 |
+
|
| 55 |
+
# Create custom styles
|
| 56 |
+
title_style = ParagraphStyle(
|
| 57 |
+
'CustomTitle',
|
| 58 |
+
parent=styles['Heading1'],
|
| 59 |
+
fontSize=24,
|
| 60 |
+
spaceAfter=30,
|
| 61 |
+
textColor=colors.HexColor('#2E86AB'),
|
| 62 |
+
alignment=1 # Center
|
| 63 |
+
)
|
| 64 |
+
|
| 65 |
+
heading_style = ParagraphStyle(
|
| 66 |
+
'CustomHeading',
|
| 67 |
+
parent=styles['Heading2'],
|
| 68 |
+
fontSize=16,
|
| 69 |
+
spaceAfter=12,
|
| 70 |
+
spaceBefore=20,
|
| 71 |
+
textColor=colors.HexColor('#2E86AB')
|
| 72 |
+
)
|
| 73 |
+
|
| 74 |
+
# Build story (content)
|
| 75 |
+
story = []
|
| 76 |
+
|
| 77 |
+
# Title page
|
| 78 |
+
story.append(Paragraph("Global Business News Intelligence Report", title_style))
|
| 79 |
+
story.append(Spacer(1, 0.5*inch))
|
| 80 |
+
|
| 81 |
+
# Query and basic info
|
| 82 |
+
story.append(Paragraph(f"Analysis Target: {results.get('query', 'N/A')}", styles['Normal']))
|
| 83 |
+
story.append(Paragraph(f"Report Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", styles['Normal']))
|
| 84 |
+
story.append(Paragraph(f"Total Articles Analyzed: {results.get('total_articles', 0)}", styles['Normal']))
|
| 85 |
+
story.append(Paragraph(f"Processing Time: {results.get('processing_time', 0):.2f} seconds", styles['Normal']))
|
| 86 |
+
story.append(Spacer(1, 0.3*inch))
|
| 87 |
+
|
| 88 |
+
# Executive Summary
|
| 89 |
+
story.append(Paragraph("Executive Summary", heading_style))
|
| 90 |
+
summary_text = _create_executive_summary(results)
|
| 91 |
+
story.append(Paragraph(summary_text, styles['Normal']))
|
| 92 |
+
story.append(Spacer(1, 0.2*inch))
|
| 93 |
+
|
| 94 |
+
# Sentiment Analysis Section
|
| 95 |
+
story.append(Paragraph("Sentiment Analysis", heading_style))
|
| 96 |
+
sentiment_data = _create_sentiment_section(results, styles)
|
| 97 |
+
story.extend(sentiment_data)
|
| 98 |
+
|
| 99 |
+
# Top Stories Section
|
| 100 |
+
story.append(Paragraph("Key Stories", heading_style))
|
| 101 |
+
stories_data = _create_stories_section(results, styles)
|
| 102 |
+
story.extend(stories_data)
|
| 103 |
+
|
| 104 |
+
# Keywords Section
|
| 105 |
+
if 'keywords' in results and results['keywords']:
|
| 106 |
+
story.append(Paragraph("Key Topics and Themes", heading_style))
|
| 107 |
+
keywords_data = _create_keywords_section(results, styles)
|
| 108 |
+
story.extend(keywords_data)
|
| 109 |
+
|
| 110 |
+
# Sources Section
|
| 111 |
+
story.append(Paragraph("News Sources", heading_style))
|
| 112 |
+
sources_data = _create_sources_section(results, styles)
|
| 113 |
+
story.extend(sources_data)
|
| 114 |
+
|
| 115 |
+
# Methodology Section
|
| 116 |
+
story.append(Paragraph("Methodology", heading_style))
|
| 117 |
+
methodology_text = _create_methodology_section(results)
|
| 118 |
+
story.append(Paragraph(methodology_text, styles['Normal']))
|
| 119 |
+
|
| 120 |
+
# Build PDF
|
| 121 |
+
doc.build(story)
|
| 122 |
+
|
| 123 |
+
buffer.seek(0)
|
| 124 |
+
return buffer
|
| 125 |
+
|
| 126 |
+
except Exception as e:
|
| 127 |
+
logger.error(f"PDF generation failed: {str(e)}")
|
| 128 |
+
return _generate_simple_pdf_fallback(results)
|
| 129 |
+
|
| 130 |
+
def _create_executive_summary(results: Dict[str, Any]) -> str:
|
| 131 |
+
"""Create executive summary text"""
|
| 132 |
+
try:
|
| 133 |
+
query = results.get('query', 'the analyzed topic')
|
| 134 |
+
total_articles = results.get('total_articles', 0)
|
| 135 |
+
avg_sentiment = results.get('average_sentiment', 0)
|
| 136 |
+
|
| 137 |
+
sentiment_label = "positive" if avg_sentiment > 0.1 else "negative" if avg_sentiment < -0.1 else "neutral"
|
| 138 |
+
|
| 139 |
+
summary = f"This report analyzes {total_articles} news articles related to {query}. "
|
| 140 |
+
summary += f"The overall sentiment analysis reveals a {sentiment_label} tone with an average sentiment score of {avg_sentiment:.3f}. "
|
| 141 |
+
|
| 142 |
+
# Add sentiment distribution
|
| 143 |
+
dist = results.get('sentiment_distribution', {})
|
| 144 |
+
positive = dist.get('Positive', 0)
|
| 145 |
+
negative = dist.get('Negative', 0)
|
| 146 |
+
neutral = dist.get('Neutral', 0)
|
| 147 |
+
|
| 148 |
+
summary += f"The analysis shows {positive} positive articles ({positive/total_articles*100:.1f}%), "
|
| 149 |
+
summary += f"{negative} negative articles ({negative/total_articles*100:.1f}%), "
|
| 150 |
+
summary += f"and {neutral} neutral articles ({neutral/total_articles*100:.1f}%). "
|
| 151 |
+
|
| 152 |
+
# Add key insights
|
| 153 |
+
if avg_sentiment > 0.2:
|
| 154 |
+
summary += "The predominantly positive coverage suggests favorable market conditions or public perception."
|
| 155 |
+
elif avg_sentiment < -0.2:
|
| 156 |
+
summary += "The predominantly negative coverage indicates concerns or challenges that may require attention."
|
| 157 |
+
else:
|
| 158 |
+
summary += "The balanced sentiment coverage suggests a mixed outlook with both opportunities and challenges present."
|
| 159 |
+
|
| 160 |
+
return summary
|
| 161 |
+
|
| 162 |
+
except Exception as e:
|
| 163 |
+
logger.error(f"Executive summary creation failed: {str(e)}")
|
| 164 |
+
return "Analysis completed successfully with comprehensive sentiment evaluation across multiple news sources."
|
| 165 |
+
|
| 166 |
+
def _create_sentiment_section(results: Dict[str, Any], styles) -> List:
    """Create sentiment analysis section"""
    story = []

    try:
        # Sentiment distribution table (guarded denominator: total_articles may be 0)
        dist = results.get('sentiment_distribution', {})
        total = max(results.get('total_articles', 0), 1)
        sentiment_data = [
            ['Sentiment', 'Count', 'Percentage'],
            ['Positive', str(dist.get('Positive', 0)), f"{dist.get('Positive', 0)/total*100:.1f}%"],
            ['Negative', str(dist.get('Negative', 0)), f"{dist.get('Negative', 0)/total*100:.1f}%"],
            ['Neutral', str(dist.get('Neutral', 0)), f"{dist.get('Neutral', 0)/total*100:.1f}%"]
        ]

        sentiment_table = Table(sentiment_data)
        sentiment_table.setStyle(TableStyle([
            ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#2E86AB')),
            ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
            ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
            ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
            ('FONTSIZE', (0, 0), (-1, 0), 12),
            ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
            ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
            ('GRID', (0, 0), (-1, -1), 1, colors.black)
        ]))

        story.append(sentiment_table)
        story.append(Spacer(1, 0.2*inch))

        # Add sentiment analysis explanation
        explanation = "Sentiment analysis was performed using multiple models including VADER, Loughran-McDonald financial dictionary, and FinBERT. "
        explanation += "Scores range from -1.0 (most negative) to +1.0 (most positive), with scores between -0.1 and +0.1 considered neutral."

        story.append(Paragraph(explanation, styles['Normal']))
        story.append(Spacer(1, 0.2*inch))

    except Exception as e:
        logger.error(f"Sentiment section creation failed: {str(e)}")
        story.append(Paragraph("Sentiment analysis data unavailable.", styles['Normal']))

    return story

def _create_stories_section(results: Dict[str, Any], styles) -> List:
    """Create top stories section"""
    story = []

    try:
        articles = results.get('articles', [])
        if not articles:
            story.append(Paragraph("No articles available for analysis.", styles['Normal']))
            return story

        # Sort articles by sentiment score
        sorted_articles = sorted(articles, key=lambda x: x.get('sentiment', {}).get('compound', 0), reverse=True)

        # Most positive story
        if sorted_articles and sorted_articles[0].get('sentiment', {}).get('compound', 0) > 0.1:
            story.append(Paragraph("Most Positive Coverage:", styles['Heading3']))
            top_positive = sorted_articles[0]
            story.append(Paragraph(f"<b>Title:</b> {top_positive.get('title', 'N/A')}", styles['Normal']))
            story.append(Paragraph(f"<b>Source:</b> {top_positive.get('source', 'N/A')}", styles['Normal']))
            story.append(Paragraph(f"<b>Sentiment Score:</b> {top_positive.get('sentiment', {}).get('compound', 0):.3f}", styles['Normal']))
            if 'summary' in top_positive:
                story.append(Paragraph(f"<b>Summary:</b> {top_positive['summary'][:300]}...", styles['Normal']))
            story.append(Spacer(1, 0.2*inch))

        # Most negative story
        negative_articles = sorted(articles, key=lambda x: x.get('sentiment', {}).get('compound', 0))
        if negative_articles and negative_articles[0].get('sentiment', {}).get('compound', 0) < -0.1:
            story.append(Paragraph("Most Negative Coverage:", styles['Heading3']))
            top_negative = negative_articles[0]
            story.append(Paragraph(f"<b>Title:</b> {top_negative.get('title', 'N/A')}", styles['Normal']))
            story.append(Paragraph(f"<b>Source:</b> {top_negative.get('source', 'N/A')}", styles['Normal']))
            story.append(Paragraph(f"<b>Sentiment Score:</b> {top_negative.get('sentiment', {}).get('compound', 0):.3f}", styles['Normal']))
            if 'summary' in top_negative:
                story.append(Paragraph(f"<b>Summary:</b> {top_negative['summary'][:300]}...", styles['Normal']))
            story.append(Spacer(1, 0.2*inch))

        # Recent stories (if dates available)
        recent_articles = [a for a in articles if a.get('date')]
        if recent_articles:
            recent_articles.sort(key=lambda x: x.get('date', ''), reverse=True)
            story.append(Paragraph("Most Recent Coverage:", styles['Heading3']))
            recent = recent_articles[0]
            story.append(Paragraph(f"<b>Title:</b> {recent.get('title', 'N/A')}", styles['Normal']))
            story.append(Paragraph(f"<b>Source:</b> {recent.get('source', 'N/A')}", styles['Normal']))
            story.append(Paragraph(f"<b>Date:</b> {recent.get('date', 'N/A')}", styles['Normal']))
            story.append(Paragraph(f"<b>Sentiment Score:</b> {recent.get('sentiment', {}).get('compound', 0):.3f}", styles['Normal']))

    except Exception as e:
        logger.error(f"Stories section creation failed: {str(e)}")
        story.append(Paragraph("Story analysis data unavailable.", styles['Normal']))

    return story

def _create_keywords_section(results: Dict[str, Any], styles) -> List:
    """Create keywords section"""
    story = []

    try:
        keywords = results.get('keywords', [])[:15]  # Top 15 keywords

        if not keywords:
            story.append(Paragraph("No keywords extracted.", styles['Normal']))
            return story

        # Create keywords table
        keyword_data = [['Keyword', 'Relevance Score', 'Category']]

        for kw in keywords:
            relevance = kw.get('relevance', 'medium')
            score = kw.get('score', 0)
            keyword_data.append([
                kw.get('keyword', 'N/A'),
                f"{score:.3f}",
                relevance.title()
            ])

        keyword_table = Table(keyword_data)
        keyword_table.setStyle(TableStyle([
            ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#2E86AB')),
            ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
            ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
            ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
            ('FONTSIZE', (0, 0), (-1, 0), 10),
            ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
            ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
            ('GRID', (0, 0), (-1, -1), 1, colors.black)
        ]))

        story.append(keyword_table)
        story.append(Spacer(1, 0.2*inch))

        # Keywords explanation
        explanation = "Keywords were extracted using the YAKE (Yet Another Keyword Extractor) algorithm, "
        explanation += "which identifies the most relevant terms and phrases based on statistical analysis of the text corpus."

        story.append(Paragraph(explanation, styles['Normal']))

    except Exception as e:
        logger.error(f"Keywords section creation failed: {str(e)}")
        story.append(Paragraph("Keyword analysis data unavailable.", styles['Normal']))

    return story

def _create_sources_section(results: Dict[str, Any], styles) -> List:
    """Create news sources section"""
    story = []

    try:
        articles = results.get('articles', [])

        if not articles:
            story.append(Paragraph("No source data available.", styles['Normal']))
            return story

        # Count sources
        source_counts = {}
        for article in articles:
            source = article.get('source', 'Unknown')
            source_counts[source] = source_counts.get(source, 0) + 1

        # Create sources table
        source_data = [['News Source', 'Article Count', 'Percentage']]
        total_articles = len(articles)

        for source, count in sorted(source_counts.items(), key=lambda x: x[1], reverse=True):
            percentage = (count / total_articles) * 100
            source_data.append([source, str(count), f"{percentage:.1f}%"])

        sources_table = Table(source_data)
        sources_table.setStyle(TableStyle([
            ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#2E86AB')),
            ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
            ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
            ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
            ('FONTSIZE', (0, 0), (-1, 0), 10),
            ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
            ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
            ('GRID', (0, 0), (-1, -1), 1, colors.black)
        ]))

        story.append(sources_table)
        story.append(Spacer(1, 0.2*inch))

        # Sources explanation
        explanation = f"Articles were collected from {len(source_counts)} different news sources, "
        explanation += "providing diverse perspectives on the analyzed topic. Source diversity helps ensure comprehensive coverage and reduces bias."

        story.append(Paragraph(explanation, styles['Normal']))

    except Exception as e:
        logger.error(f"Sources section creation failed: {str(e)}")
        story.append(Paragraph("Source analysis data unavailable.", styles['Normal']))

    return story

def _create_methodology_section(results: Dict[str, Any]) -> str:
    """Create methodology section text"""
    methodology = "This analysis employed a comprehensive natural language processing pipeline:\n\n"

    methodology += "1. <b>Data Collection:</b> News articles were scraped from multiple reliable sources using RSS feeds and web scraping techniques. "
    methodology += "Content was filtered for relevance and deduplicated to ensure quality.\n\n"

    methodology += "2. <b>Sentiment Analysis:</b> Three complementary models were used: "
    methodology += "VADER (general sentiment), Loughran-McDonald dictionary (financial sentiment), and FinBERT (financial domain-specific). "
    methodology += "Final scores represent a weighted combination of all models.\n\n"

    methodology += "3. <b>Text Processing:</b> Articles were cleaned, summarized using transformer models, and analyzed for key themes. "
    methodology += "Keyword extraction employed the YAKE algorithm for statistical relevance.\n\n"

    methodology += "4. <b>Quality Assurance:</b> All content was filtered for English language, minimum length requirements, and relevance to the query terms. "
    methodology += "Results were validated across multiple model outputs for consistency.\n\n"

    if results.get('processing_time'):
        methodology += f"Total processing time: {results['processing_time']:.2f} seconds for {results.get('total_articles', 0)} articles."

    return methodology

def _generate_simple_pdf_fallback(results: Dict[str, Any]) -> io.BytesIO:
    """Generate a simple text-based PDF fallback"""
    try:
        from fpdf import FPDF

        pdf = FPDF()
        pdf.add_page()
        pdf.set_font('Arial', 'B', 16)
        pdf.cell(40, 10, 'News Analysis Report')
        pdf.ln(20)

        pdf.set_font('Arial', '', 12)
        pdf.cell(40, 10, f"Query: {results.get('query', 'N/A')}")
        pdf.ln(10)
        pdf.cell(40, 10, f"Articles: {results.get('total_articles', 0)}")
        pdf.ln(10)
        pdf.cell(40, 10, f"Average Sentiment: {results.get('average_sentiment', 0):.3f}")
        pdf.ln(20)

        # Simple sentiment distribution
        dist = results.get('sentiment_distribution', {})
        pdf.cell(40, 10, 'Sentiment Distribution:')
        pdf.ln(10)
        pdf.cell(40, 10, f"Positive: {dist.get('Positive', 0)}")
        pdf.ln(10)
        pdf.cell(40, 10, f"Negative: {dist.get('Negative', 0)}")
        pdf.ln(10)
        pdf.cell(40, 10, f"Neutral: {dist.get('Neutral', 0)}")

        # Save to buffer. fpdf2 (pinned in requirements) returns a bytearray
        # from output(), so the old dest='S' / latin-1 encode step is not needed.
        buffer = io.BytesIO()
        buffer.write(bytes(pdf.output()))
        buffer.seek(0)

        return buffer

    except Exception as e:
        logger.error(f"PDF fallback failed: {str(e)}")
        # Return a minimal buffer as last resort
        buffer = io.BytesIO()
        buffer.write(b"PDF generation failed. Please check logs.")
        buffer.seek(0)
        return buffer

def create_chart_image(data: Dict, chart_type: str = 'pie') -> Optional[str]:
    """Create a chart image for PDF inclusion"""
    if not MATPLOTLIB_AVAILABLE:
        return None

    try:
        plt.figure(figsize=(6, 4))

        if chart_type == 'pie' and 'sentiment_distribution' in data:
            dist = data['sentiment_distribution']
            labels = ['Positive', 'Negative', 'Neutral']
            sizes = [dist.get('Positive', 0), dist.get('Negative', 0), dist.get('Neutral', 0)]
            pie_colors = ['#28a745', '#dc3545', '#6c757d']  # renamed so it does not shadow reportlab's colors module

            plt.pie(sizes, labels=labels, colors=pie_colors, autopct='%1.1f%%', startangle=90)
            plt.title('Sentiment Distribution')

        elif chart_type == 'bar' and 'articles' in data:
            articles = data['articles']
            sources = {}
            for article in articles:
                source = article.get('source', 'Unknown')
                sources[source] = sources.get(source, 0) + 1

            # Top 10 sources
            top_sources = dict(sorted(sources.items(), key=lambda x: x[1], reverse=True)[:10])

            plt.bar(range(len(top_sources)), list(top_sources.values()), color='#2E86AB')
            plt.xticks(range(len(top_sources)), list(top_sources.keys()), rotation=45, ha='right')
            plt.title('Articles by Source')
            plt.ylabel('Article Count')

        plt.tight_layout()

        # Save to base64 string
        buffer = io.BytesIO()
        plt.savefig(buffer, format='png', dpi=150, bbox_inches='tight')
        buffer.seek(0)

        image_base64 = base64.b64encode(buffer.getvalue()).decode()
        plt.close()

        return image_base64

    except Exception as e:
        plt.close()  # release the figure even when chart creation fails
        logger.error(f"Chart creation failed: {str(e)}")
        return None

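# A minimal usage sketch for create_chart_image (the sample data below is
# hypothetical): the function returns a base64-encoded PNG string, which can
# be decoded back to bytes for embedding in a report or writing to disk.
#
#     sample = {'sentiment_distribution': {'Positive': 12, 'Negative': 5, 'Neutral': 3}}
#     encoded = create_chart_image(sample, chart_type='pie')
#     if encoded:
#         with open('sentiment_pie.png', 'wb') as f:
#             f.write(base64.b64decode(encoded))
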
def generate_csv_report(results: Dict[str, Any]) -> str:
    """Generate CSV report"""
    try:
        import csv
        import io

        output = io.StringIO()
        writer = csv.writer(output)

        # Write header
        writer.writerow([
            'Title', 'Source', 'URL', 'Date', 'Sentiment_Score', 'Sentiment_Label',
            'VADER_Score', 'LM_Score', 'FinBERT_Score', 'Summary'
        ])

        # Write article data
        articles = results.get('articles', [])
        for article in articles:
            sentiment = article.get('sentiment', {})
            compound = sentiment.get('compound', 0)

            # Determine sentiment label
            if compound > 0.1:
                label = 'Positive'
            elif compound < -0.1:
                label = 'Negative'
            else:
                label = 'Neutral'

            writer.writerow([
                article.get('title', ''),
                article.get('source', ''),
                article.get('url', ''),
                article.get('date', ''),
                compound,
                label,
                sentiment.get('vader', ''),
                sentiment.get('loughran_mcdonald', ''),
                sentiment.get('finbert', ''),
                article.get('summary', '')[:200] + '...' if len(article.get('summary', '')) > 200 else article.get('summary', '')
            ])

        return output.getvalue()

    except Exception as e:
        logger.error(f"CSV generation failed: {str(e)}")
        return "Error generating CSV report"

def generate_json_report(results: Dict[str, Any]) -> str:
    """Generate JSON report with formatted output"""
    try:
        import json
        from datetime import datetime

        # Create comprehensive report
        report = {
            'metadata': {
                'report_generated': datetime.now().isoformat(),
                'query': results.get('query', ''),
                'total_articles': results.get('total_articles', 0),
                'processing_time_seconds': results.get('processing_time', 0),
                'languages': results.get('languages', ['English'])
            },
            'summary': {
                'average_sentiment': results.get('average_sentiment', 0),
                'sentiment_distribution': results.get('sentiment_distribution', {}),
                'top_sources': _get_top_sources(results),
                'date_range': results.get('summary', {}).get('date_range', {})
            },
            'articles': results.get('articles', []),
            'keywords': results.get('keywords', [])[:20],  # Top 20 keywords
            'analysis_methods': {
                'sentiment_models': ['VADER', 'Loughran-McDonald', 'FinBERT'],
                'summarization_model': 'DistilBART',
                'keyword_extraction': 'YAKE',
                'translation_models': ['Helsinki-NLP Opus-MT']
            }
        }

        return json.dumps(report, indent=2, default=str, ensure_ascii=False)

    except Exception as e:
        logger.error(f"JSON generation failed: {str(e)}")
        return json.dumps({'error': str(e)}, indent=2)

def _get_top_sources(results: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Get top news sources from results"""
    try:
        articles = results.get('articles', [])
        sources = {}

        for article in articles:
            source = article.get('source', 'Unknown')
            sources[source] = sources.get(source, 0) + 1

        # Convert to list and sort
        source_list = [
            {'source': source, 'count': count, 'percentage': round((count / len(articles)) * 100, 1)}
            for source, count in sources.items()
        ]

        return sorted(source_list, key=lambda x: x['count'], reverse=True)[:10]

    except Exception as e:
        logger.error(f"Top sources calculation failed: {str(e)}")
        return []

def validate_report_data(results: Dict[str, Any]) -> bool:
    """Validate that results contain required data for reporting"""
    required_keys = ['query', 'articles', 'total_articles']

    for key in required_keys:
        if key not in results:
            logger.error(f"Missing required key for reporting: {key}")
            return False

    if not isinstance(results['articles'], list) or len(results['articles']) == 0:
        logger.error("No articles available for reporting")
        return False

    return True

# Export functions
__all__ = [
    'generate_pdf_report',
    'generate_csv_report',
    'generate_json_report',
    'create_chart_image',
    'validate_report_data'
]
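
# A minimal end-to-end usage sketch; the sample results dict is hypothetical
# and only mirrors the keys the functions above expect. Real runs receive this
# structure from the analysis pipeline.
if __name__ == '__main__':
    sample_results = {
        'query': 'Tesla',
        'total_articles': 1,
        'average_sentiment': 0.42,
        'sentiment_distribution': {'Positive': 1, 'Negative': 0, 'Neutral': 0},
        'articles': [{
            'title': 'Example headline', 'source': 'Example Wire',
            'url': 'https://example.com/article', 'date': '2024-01-01',
            'sentiment': {'compound': 0.42}, 'summary': 'Short summary.'
        }],
    }
    if validate_report_data(sample_results):
        print(generate_csv_report(sample_results))
        print(generate_json_report(sample_results))
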
requirements_file.txt
ADDED
@@ -0,0 +1,47 @@
# Core Framework
streamlit==1.28.1
fastapi==0.104.1
uvicorn==0.24.0

# Web Scraping & RSS
requests==2.31.0
beautifulsoup4==4.12.2
feedparser==6.0.10
trafilatura==1.6.2
lxml==4.9.3

# NLP & Machine Learning
transformers==4.35.2
# pip does not accept --index-url appended to a requirement line, so the
# CPU-only PyTorch wheel index is given as a standalone option line instead
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.1.0
nltk==3.8.1
langdetect==1.0.9
yake==0.4.8
vaderSentiment==3.3.2

# Data Processing
pandas==2.0.3
numpy==1.24.3

# Visualization
plotly==5.17.0
matplotlib==3.7.2
wordcloud==1.9.2

# Translation & Audio
gtts==2.4.0

# Report Generation
reportlab==4.0.4
fpdf2==2.7.6

# Utilities
python-dotenv==1.0.0
psutil==5.9.5
Pillow==10.0.1

# HTTP & Async
httpx==0.25.0
aiofiles==23.2.1

# Caching
diskcache==5.6.3
scraper_module.py
ADDED
@@ -0,0 +1,396 @@
import requests
from bs4 import BeautifulSoup
import feedparser
import trafilatura
from urllib.parse import urljoin, urlparse
import time
import logging
from datetime import datetime, timedelta
from typing import List, Dict, Optional, Set
import hashlib
import re
from langdetect import detect
import random
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

logger = logging.getLogger(__name__)

class NewsletterScraper:
    """Robust news scraper with multiple sources and deduplication"""

    def __init__(self):
        self.session = self._create_session()
        self.scraped_urls: Set[str] = set()
        self.content_hashes: Set[str] = set()

        # News sources configuration
        self.rss_sources = {
            'google_news': 'https://news.google.com/rss/search?q={}&hl=en&gl=US&ceid=US:en',
            'yahoo_finance': 'https://feeds.finance.yahoo.com/rss/2.0/headline',
            'reuters_business': 'https://www.reutersagency.com/feed/?best-topics=business-finance&post_type=best',
            'bbc_business': 'http://feeds.bbci.co.uk/news/business/rss.xml',
            'cnbc': 'https://www.cnbc.com/id/100003114/device/rss/rss.html',
            'marketwatch': 'http://feeds.marketwatch.com/marketwatch/topstories/',
            'financial_times': 'https://www.ft.com/rss/home',
            'bloomberg': 'https://feeds.bloomberg.com/politics/news.rss'
        }

        self.user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:89.0) Gecko/20100101 Firefox/89.0'
        ]

        logger.info("NewsletterScraper initialized")

    def _create_session(self) -> requests.Session:
        """Create a session with retry strategy"""
        session = requests.Session()

        # Retry strategy
        retry_strategy = Retry(
            total=3,
            backoff_factor=1,
            status_forcelist=[429, 500, 502, 503, 504],
        )

        adapter = HTTPAdapter(max_retries=retry_strategy)
        session.mount("http://", adapter)
        session.mount("https://", adapter)

        return session

    def _get_random_headers(self) -> Dict[str, str]:
        """Get randomized headers to avoid blocking"""
        return {
            'User-Agent': random.choice(self.user_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }

    def scrape_news(self, query: str, max_articles: int = 20) -> List[Dict]:
        """Main scraping function"""
        logger.info(f"Starting news scraping for query: {query}")

        all_articles = []
        self.scraped_urls.clear()
        self.content_hashes.clear()

        try:
            # Primary: Google News RSS
            google_articles = self._scrape_google_news(query, max_articles // 2)
            all_articles.extend(google_articles)

            # Secondary: other RSS sources (limited to avoid timeouts)
            for source_name, rss_url in list(self.rss_sources.items())[1:4]:
                if len(all_articles) >= max_articles:
                    break

                try:
                    source_articles = self._scrape_rss_source(rss_url, query, 5)
                    all_articles.extend(source_articles)
                except Exception as e:
                    logger.warning(f"Failed to scrape {source_name}: {str(e)}")
                    continue

            # Deduplicate and filter
            articles = self._deduplicate_articles(all_articles)
            articles = self._filter_articles(articles, query)
            articles = articles[:max_articles]

            # Extract full content
            for article in articles:
                try:
                    full_content = self._extract_full_content(article['url'])
                    if full_content and len(full_content) > 200:
                        article['content'] = full_content
                    else:
                        article['content'] = article.get('summary', article.get('title', ''))
                except Exception as e:
                    logger.warning(f"Failed to extract content from {article['url']}: {str(e)}")
                    article['content'] = article.get('summary', article.get('title', ''))

            # Filter by language (English only)
            articles = [article for article in articles if self._is_english(article['content'])]

            logger.info(f"Successfully scraped {len(articles)} articles")
            return articles

        except Exception as e:
            logger.error(f"Error in scrape_news: {str(e)}")
            return []

    def _scrape_google_news(self, query: str, max_articles: int) -> List[Dict]:
        """Scrape Google News RSS"""
        try:
            url = self.rss_sources['google_news'].format(query.replace(' ', '%20'))

            headers = self._get_random_headers()
            response = self.session.get(url, headers=headers, timeout=10)

            if response.status_code != 200:
                logger.warning(f"Google News RSS returned status {response.status_code}")
                return []

            feed = feedparser.parse(response.content)
            articles = []

            for entry in feed.entries[:max_articles * 2]:  # Get extra for filtering
                try:
                    article = {
                        'title': entry.title,
                        'url': entry.link,
                        'summary': entry.get('summary', ''),
                        'date': self._parse_date(entry.get('published', '')),
                        'source': 'Google News'
                    }

                    # Skip if already seen
                    if article['url'] in self.scraped_urls:
                        continue

                    self.scraped_urls.add(article['url'])
                    articles.append(article)

                except Exception as e:
                    logger.warning(f"Error parsing Google News entry: {str(e)}")
                    continue

            return articles

        except Exception as e:
            logger.error(f"Error scraping Google News: {str(e)}")
            return []

    def _scrape_rss_source(self, rss_url: str, query: str, max_articles: int) -> List[Dict]:
        """Scrape a generic RSS source"""
        try:
            headers = self._get_random_headers()
            response = self.session.get(rss_url, headers=headers, timeout=10)

            if response.status_code != 200:
                return []

            feed = feedparser.parse(response.content)
            articles = []
            query_lower = query.lower()

            for entry in feed.entries[:max_articles * 3]:  # Get extra for filtering
                try:
                    title = entry.get('title', '')
                    summary = entry.get('summary', '')

                    # Check if article is relevant to query
                    if not (query_lower in title.lower() or query_lower in summary.lower()):
                        continue

                    article = {
                        'title': title,
                        'url': entry.get('link', ''),
                        'summary': summary,
                        'date': self._parse_date(entry.get('published', '')),
                        'source': self._extract_source_name(rss_url)
                    }

                    # Skip if already seen
                    if article['url'] in self.scraped_urls:
                        continue

                    self.scraped_urls.add(article['url'])
                    articles.append(article)

                    if len(articles) >= max_articles:
                        break

                except Exception as e:
                    logger.warning(f"Error parsing RSS entry: {str(e)}")
                    continue

            # Small delay to be respectful
            time.sleep(0.5)

            return articles

        except Exception as e:
            logger.error(f"Error scraping RSS {rss_url}: {str(e)}")
            return []

    def _extract_full_content(self, url: str) -> Optional[str]:
        """Extract full article content using trafilatura"""
        try:
            # Download the page through the retry-enabled session.
            # trafilatura.fetch_url() does not accept custom headers, so the
            # randomized headers are applied here and the HTML string is
            # handed to trafilatura.extract() instead.
            headers = self._get_random_headers()
            response = self.session.get(url, headers=headers, timeout=10)

            if response.status_code != 200 or not response.text:
                return None

            # Extract text content from the downloaded HTML
            text = trafilatura.extract(
                response.text,
                include_comments=False,
                include_tables=False,
                include_formatting=False,
                no_fallback=False
            )

            if text and len(text.strip()) > 100:
                return text.strip()

            return None

        except Exception as e:
            logger.warning(f"Error extracting content from {url}: {str(e)}")
            return None

    def _deduplicate_articles(self, articles: List[Dict]) -> List[Dict]:
        """Remove duplicate articles based on content similarity"""
        unique_articles = []

        for article in articles:
            # Create content hash
            content_for_hash = f"{article['title']} {article.get('summary', '')}"
            content_hash = hashlib.md5(content_for_hash.encode()).hexdigest()

            if content_hash not in self.content_hashes:
                self.content_hashes.add(content_hash)
                unique_articles.append(article)

        logger.info(f"Deduplicated {len(articles)} -> {len(unique_articles)} articles")
        return unique_articles

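    # Note: the dedup key is an exact MD5 over title plus summary, so only
    # byte-identical text collapses; near-duplicates with trivial wording
    # changes survive. A one-off illustration (hypothetical titles):
    #   hashlib.md5(b"Tesla beats estimates").hexdigest()
    #   hashlib.md5(b"Tesla beats estimates.").hexdigest()  # trailing dot -> different hash
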
    def _filter_articles(self, articles: List[Dict], query: str) -> List[Dict]:
        """Filter articles for relevance and quality"""
        filtered_articles = []
        query_lower = query.lower()

        for article in articles:
            # Check minimum content length
            title_summary = f"{article['title']} {article.get('summary', '')}"
            if len(title_summary.strip()) < 50:
                continue

            # Check relevance (more flexible than RSS filtering)
            if (query_lower in article['title'].lower() or
                    query_lower in article.get('summary', '').lower() or
                    any(word in article['title'].lower() for word in query_lower.split())):

                filtered_articles.append(article)

        logger.info(f"Filtered {len(articles)} -> {len(filtered_articles)} articles for relevance")
        return filtered_articles

    def _is_english(self, text: str) -> bool:
        """Check if text is in English using language detection"""
        try:
            if len(text.strip()) < 20:
                return True  # Assume short text is English

            detected_lang = detect(text[:1000])  # Check first 1000 chars
            return detected_lang == 'en'

        except Exception:
            # If detection fails, assume English
            return True

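    # langdetect is probabilistic and can flip labels on borderline text
    # between runs; if reproducible results matter, it can be seeded once at
    # import time (a documented langdetect option):
    #   from langdetect import DetectorFactory
    #   DetectorFactory.seed = 0
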
    def _parse_date(self, date_str: str) -> Optional[datetime]:
        """Parse date from RSS feed"""
        if not date_str:
            return datetime.now()

        try:
            # Try common RSS date formats
            for fmt in ['%a, %d %b %Y %H:%M:%S %Z',
                        '%Y-%m-%dT%H:%M:%SZ',
                        '%Y-%m-%d %H:%M:%S']:
                try:
                    return datetime.strptime(date_str.strip(), fmt)
                except ValueError:
                    continue

            # If all formats fail, return current time
            return datetime.now()

        except Exception:
            return datetime.now()

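    # RSS dates are usually RFC 2822 ("Tue, 05 Mar 2024 14:30:00 GMT"); the
    # stdlib parser handles these, including numeric offsets that %Z misses,
    # and could serve as a first attempt before the format loop above:
    #   from email.utils import parsedate_to_datetime
    #   parsedate_to_datetime('Tue, 05 Mar 2024 14:30:00 +0530')
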
    def _extract_source_name(self, url: str) -> str:
        """Extract source name from URL"""
        try:
            domain = urlparse(url).netloc

            # Clean up common domain patterns
            domain = domain.replace('www.', '').replace('feeds.', '')

            # Map known domains to clean names
            domain_mapping = {
                'news.google.com': 'Google News',
                'finance.yahoo.com': 'Yahoo Finance',
                'reuters.com': 'Reuters',
                'reutersagency.com': 'Reuters',
                'bbc.co.uk': 'BBC',
                'cnbc.com': 'CNBC',
                'marketwatch.com': 'MarketWatch',
                'ft.com': 'Financial Times',
                'bloomberg.com': 'Bloomberg'
            }

            return domain_mapping.get(domain, domain.title())

        except Exception:
            return 'Unknown'

    def get_available_sources(self) -> List[str]:
        """Get list of available news sources"""
        return list(self.rss_sources.keys())

# Additional utility functions for scraping
def clean_html(html_content: str) -> str:
    """Clean HTML content and extract text"""
    try:
        soup = BeautifulSoup(html_content, 'html.parser')

        # Remove script and style elements
        for script in soup(["script", "style"]):
            script.extract()

        # Get text
        text = soup.get_text()

        # Clean up whitespace
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
        text = ' '.join(chunk for chunk in chunks if chunk)

        return text

    except Exception as e:
        logger.error(f"Error cleaning HTML: {str(e)}")
        return ""

def is_valid_article_url(url: str) -> bool:
    """Check if URL is likely to be a valid article URL"""
    try:
        parsed = urlparse(url)

        # Skip certain file types
        skip_extensions = ['.pdf', '.jpg', '.png', '.gif', '.mp4', '.mp3']
        if any(url.lower().endswith(ext) for ext in skip_extensions):
            return False

        # Skip obvious non-article URLs
        skip_patterns = ['login', 'register', 'subscribe', 'newsletter', 'sitemap']
        if any(pattern in url.lower() for pattern in skip_patterns):
            return False

        return True

    except Exception:
        return False
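
# A minimal usage sketch, runnable as a script (needs network access; the
# query and article count are illustrative):
if __name__ == '__main__':
    scraper = NewsletterScraper()
    for art in scraper.scrape_news('Tesla', max_articles=5):
        print(f"{art['source']}: {art['title']}")
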
streamlit_app.py
ADDED
@@ -0,0 +1,562 @@
import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import asyncio
import json
import base64
from datetime import datetime
import io
import os

# Import our modules
from api import NewsAnalyzer
from utils import load_config, cache_results
from report import generate_pdf_report

# Configure page
st.set_page_config(
    page_title="Global Business News Intelligence Dashboard",
    page_icon="📊",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Custom CSS
st.markdown("""
<style>
    .main-header {
        font-size: 2.5rem;
        font-weight: bold;
        text-align: center;
        color: #2E86AB;
        margin-bottom: 2rem;
    }
    .metric-card {
        background-color: #f0f2f6;
        padding: 1rem;
        border-radius: 10px;
        border-left: 4px solid #2E86AB;
    }
    .sentiment-positive { color: #28a745; font-weight: bold; }
    .sentiment-negative { color: #dc3545; font-weight: bold; }
    .sentiment-neutral { color: #6c757d; font-weight: bold; }
    .audio-container {
        background-color: #f8f9fa;
        padding: 10px;
        border-radius: 5px;
        margin: 10px 0;
    }
</style>
""", unsafe_allow_html=True)

# Initialize session state
if 'analyzer' not in st.session_state:
    st.session_state.analyzer = NewsAnalyzer()
if 'results' not in st.session_state:
    st.session_state.results = None
if 'analysis_complete' not in st.session_state:
    st.session_state.analysis_complete = False

def main():
    # Header
    st.markdown('<h1 class="main-header">🌐 Global Business News Intelligence Dashboard</h1>', unsafe_allow_html=True)
    st.markdown("**Real-time sentiment analysis, multilingual summaries, and audio insights for business intelligence**")

    # Sidebar
    with st.sidebar:
        st.header("⚙️ Configuration")

        # Input section
        st.subheader("🎯 Target Analysis")
        query_type = st.selectbox("Query Type", ["Company", "Stock Ticker", "Keyword", "Industry"])
        query = st.text_input(f"Enter {query_type}:", placeholder="e.g., Tesla, TSLA, AI technology")

        st.subheader("📊 Analysis Settings")
        num_articles = st.slider("Number of Articles", 5, 50, 20)
        languages = st.multiselect(
            "Summary Languages",
            ["English", "Hindi", "Tamil"],
            default=["English"]
        )
        include_audio = st.checkbox("Generate Audio Summaries", True)

        st.subheader("🔧 Model Settings")
        sentiment_models = st.multiselect(
            "Sentiment Models",
            ["VADER", "Loughran-McDonald", "FinBERT"],
            default=["VADER", "Loughran-McDonald", "FinBERT"]
        )

        # Analysis button
        analyze_button = st.button("🚀 Analyze News", type="primary", use_container_width=True)

    # Main content area
    if analyze_button and query:
        st.session_state.analysis_complete = False
        with st.spinner("🔍 Analyzing news articles... This may take a few minutes."):
            try:
                # Create progress bar
                progress_bar = st.progress(0)
                status_text = st.empty()

                # Run analysis
                config = {
                    'query': query,
                    'num_articles': num_articles,
                    'languages': languages,
                    'include_audio': include_audio,
                    'sentiment_models': sentiment_models
                }

                # Update progress
                status_text.text("🔍 Scraping articles...")
                progress_bar.progress(20)

                results = st.session_state.analyzer.analyze_news(config, progress_callback=update_progress)
                st.session_state.results = results
                st.session_state.analysis_complete = True

                progress_bar.progress(100)
                status_text.text("✅ Analysis complete!")

            except Exception as e:
                st.error(f"Error during analysis: {str(e)}")
                st.session_state.analysis_complete = False

    # Display results
    if st.session_state.analysis_complete and st.session_state.results:
        display_results(st.session_state.results)

    elif not st.session_state.analysis_complete and query:
        st.info("👆 Click 'Analyze News' to start the analysis")

    else:
        show_demo_dashboard()

def update_progress(progress, status):
    """Callback function for progress updates"""
    # This would be called from the analyzer
    pass

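# update_progress above is a stub; one way to give the analyzer live feedback
# is to bind a callback to the widgets created in main(). A sketch, not the
# app's actual wiring:
#
#   def make_progress_callback(progress_bar, status_text):
#       def _callback(progress: int, status: str):
#           progress_bar.progress(progress)
#           status_text.text(status)
#       return _callback
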
def display_results(results):
    """Display analysis results with interactive dashboard"""
    st.header(f"📈 Analysis Results for: {results['query']}")

    # Key metrics
    col1, col2, col3, col4 = st.columns(4)

    with col1:
        st.markdown('<div class="metric-card">', unsafe_allow_html=True)
        st.metric("Articles Analyzed", len(results['articles']))
        st.markdown('</div>', unsafe_allow_html=True)

    with col2:
        avg_sentiment = results['summary']['average_sentiment']
        sentiment_color = "sentiment-positive" if avg_sentiment > 0.1 else "sentiment-negative" if avg_sentiment < -0.1 else "sentiment-neutral"
        st.markdown('<div class="metric-card">', unsafe_allow_html=True)
        st.metric("Average Sentiment", f"{avg_sentiment:.3f}")
        st.markdown('</div>', unsafe_allow_html=True)

    with col3:
        st.markdown('<div class="metric-card">', unsafe_allow_html=True)
        st.metric("Sources", len(set(article['source'] for article in results['articles'])))
        st.markdown('</div>', unsafe_allow_html=True)

    with col4:
        st.markdown('<div class="metric-card">', unsafe_allow_html=True)
        st.metric("Languages", len(results.get('languages', ['English'])))
        st.markdown('</div>', unsafe_allow_html=True)

    # Tabs for different views
    tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs(["📊 Dashboard", "📰 Articles", "🎯 Sentiment", "🗣️ Audio", "📤 Export", "🔌 API"])

    with tab1:
        display_dashboard(results)

    with tab2:
        display_articles(results)

    with tab3:
        display_sentiment_analysis(results)

    with tab4:
        display_audio_summaries(results)

    with tab5:
        display_export_options(results)

    with tab6:
        display_api_info(results)

def display_dashboard(results):
    """Display main dashboard with charts"""
    col1, col2 = st.columns(2)

    with col1:
        # Sentiment distribution
        st.subheader("📊 Sentiment Distribution")
        sentiment_counts = {
            'Positive': sum(1 for article in results['articles'] if article['sentiment']['compound'] > 0.1),
            'Negative': sum(1 for article in results['articles'] if article['sentiment']['compound'] < -0.1),
            'Neutral': sum(1 for article in results['articles'] if -0.1 <= article['sentiment']['compound'] <= 0.1)
        }

        fig_pie = px.pie(
            values=list(sentiment_counts.values()),
            names=list(sentiment_counts.keys()),
            color_discrete_map={'Positive': '#28a745', 'Negative': '#dc3545', 'Neutral': '#6c757d'}
        )
        st.plotly_chart(fig_pie, use_container_width=True)

    with col2:
        # Source distribution
        st.subheader("📰 Source Distribution")
        source_counts = {}
        for article in results['articles']:
            source = article['source']
            source_counts[source] = source_counts.get(source, 0) + 1

        fig_bar = px.bar(
            x=list(source_counts.keys()),
            y=list(source_counts.values()),
            color=list(source_counts.values()),
            color_continuous_scale="viridis"
        )
        fig_bar.update_layout(xaxis_title="Source", yaxis_title="Article Count")
        st.plotly_chart(fig_bar, use_container_width=True)

    # Timeline chart
    st.subheader("📈 Sentiment Over Time")
    if results['articles']:
        df_timeline = pd.DataFrame([
            {
                'date': article.get('date', datetime.now()),
                'sentiment': article['sentiment']['compound'],
                'title': article['title'][:50] + "..." if len(article['title']) > 50 else article['title']
            }
            for article in results['articles']
            if 'date' in article
        ])

        if not df_timeline.empty:
            fig_timeline = px.scatter(
                df_timeline,
                x='date',
                y='sentiment',
                hover_data=['title'],
                color='sentiment',
                color_continuous_scale=['red', 'gray', 'green'],
                color_continuous_midpoint=0
            )
            fig_timeline.update_layout(
                xaxis_title="Date",
                yaxis_title="Sentiment Score",
                yaxis=dict(range=[-1, 1])
            )
            st.plotly_chart(fig_timeline, use_container_width=True)

    # Keywords word cloud
    st.subheader("🔤 Key Topics")
    if 'keywords' in results and results['keywords']:
        col1, col2 = st.columns([2, 1])

        with col1:
            # Create word cloud
            keywords_text = ' '.join(kw['keyword'] for kw in results['keywords'][:50])
            if keywords_text:
                wordcloud = WordCloud(
                    width=800,
                    height=400,
                    background_color='white',
                    colormap='viridis'
                ).generate(keywords_text)

                fig, ax = plt.subplots(figsize=(10, 5))
                ax.imshow(wordcloud, interpolation='bilinear')
                ax.axis('off')
                st.pyplot(fig)

        with col2:
            st.write("**Top Keywords:**")
            for i, kw in enumerate(results['keywords'][:10]):
                st.write(f"{i+1}. {kw['keyword']} ({kw['score']:.3f})")

def display_articles(results):
    """Display individual articles with summaries"""
    st.subheader(f"📰 Articles ({len(results['articles'])})")

    for i, article in enumerate(results['articles']):
        with st.expander(f"📄 {article['title']}", expanded=(i < 3)):
            col1, col2 = st.columns([3, 1])

            with col1:
                st.write(f"**Source:** {article['source']}")
                if 'date' in article:
                    st.write(f"**Date:** {article['date']}")
                st.write(f"**URL:** {article.get('url', 'N/A')}")

                # Sentiment
                sentiment = article['sentiment']
                sentiment_label = "Positive" if sentiment['compound'] > 0.1 else "Negative" if sentiment['compound'] < -0.1 else "Neutral"
                sentiment_color = "sentiment-positive" if sentiment_label == "Positive" else "sentiment-negative" if sentiment_label == "Negative" else "sentiment-neutral"
                st.markdown(f"**Sentiment:** <span class='{sentiment_color}'>{sentiment_label} ({sentiment['compound']:.3f})</span>", unsafe_allow_html=True)

            with col2:
                # Model-specific scores
                st.write("**Model Scores:**")
                if 'vader' in sentiment:
                    st.write(f"VADER: {sentiment['vader']:.3f}")
                if 'loughran_mcdonald' in sentiment:
                    st.write(f"L&M: {sentiment['loughran_mcdonald']:.3f}")
                if 'finbert' in sentiment:
                    st.write(f"FinBERT: {sentiment['finbert']:.3f}")

            # Summary
            if 'summary' in article:
                st.write("**Summary:**")
                st.write(article['summary'])

            # Multilingual summaries
            if 'summaries' in article:
                for lang, summary in article['summaries'].items():
                    if lang != 'English':
                        st.write(f"**Summary ({lang}):**")
                        st.write(summary)

def display_sentiment_analysis(results):
    """Display detailed sentiment analysis"""
    st.subheader("🎯 Detailed Sentiment Analysis")

    # Model comparison
    if results['articles']:
        model_data = []
        for article in results['articles']:
            sentiment = article['sentiment']
            row = {'title': article['title'][:30] + "..."}
            if 'vader' in sentiment:
                row['VADER'] = sentiment['vader']
            if 'loughran_mcdonald' in sentiment:
                row['Loughran-McDonald'] = sentiment['loughran_mcdonald']
            if 'finbert' in sentiment:
                row['FinBERT'] = sentiment['finbert']
            row['Final Score'] = sentiment['compound']
            model_data.append(row)

        df_models = pd.DataFrame(model_data)
        st.write("**Model Comparison:**")
        st.dataframe(df_models, use_container_width=True)

        # Correlation heatmap
        numeric_cols = [col for col in df_models.columns if col != 'title']
        if len(numeric_cols) > 1:
            corr_matrix = df_models[numeric_cols].corr()
            fig_heatmap = px.imshow(
                corr_matrix,
                text_auto=True,
                aspect="auto",
                color_continuous_scale="RdBu_r",
                color_continuous_midpoint=0
            )
            fig_heatmap.update_layout(title="Model Correlation Matrix")
            st.plotly_chart(fig_heatmap, use_container_width=True)

    # Top positive and negative articles
    col1, col2 = st.columns(2)

    with col1:
        st.write("**Most Positive Articles:**")
        positive_articles = sorted(
            results['articles'],
            key=lambda x: x['sentiment']['compound'],
            reverse=True
        )[:5]

        for article in positive_articles:
            st.write(f"• {article['title'][:50]}... ({article['sentiment']['compound']:.3f})")

    with col2:
        st.write("**Most Negative Articles:**")
        negative_articles = sorted(
            results['articles'],
            key=lambda x: x['sentiment']['compound']
        )[:5]

        for article in negative_articles:
            st.write(f"• {article['title'][:50]}... ({article['sentiment']['compound']:.3f})")

def display_audio_summaries(results):
|
| 391 |
+
"""Display audio summaries for different languages"""
|
| 392 |
+
st.subheader("🎵 Audio Summaries")
|
| 393 |
+
|
| 394 |
+
if 'audio_files' in results:
|
| 395 |
+
for lang, audio_file in results['audio_files'].items():
|
| 396 |
+
st.write(f"**{lang} Summary:**")
|
| 397 |
+
|
| 398 |
+
# Create audio player
|
| 399 |
+
if os.path.exists(audio_file):
|
| 400 |
+
with open(audio_file, 'rb') as audio_file_obj:
|
| 401 |
+
audio_bytes = audio_file_obj.read()
|
| 402 |
+
st.audio(audio_bytes, format='audio/mp3')
|
| 403 |
+
else:
|
| 404 |
+
st.write("Audio file not found")
|
| 405 |
+
else:
|
| 406 |
+
st.info("No audio summaries available. Enable audio generation in settings.")
|
| 407 |
+
|
| 408 |
+
def display_export_options(results):
|
| 409 |
+
"""Display export options"""
|
| 410 |
+
st.subheader("📤 Export Results")
|
| 411 |
+
|
| 412 |
+
col1, col2, col3 = st.columns(3)
|
| 413 |
+
|
| 414 |
+
with col1:
|
| 415 |
+
# CSV Export
|
| 416 |
+
if st.button("📊 Download CSV", use_container_width=True):
|
| 417 |
+
csv_data = prepare_csv_export(results)
|
| 418 |
+
st.download_button(
|
| 419 |
+
label="Click to Download CSV",
|
| 420 |
+
data=csv_data,
|
| 421 |
+
file_name=f"news_analysis_{datetime.now().strftime('%Y%m%d_%H%M')}.csv",
|
| 422 |
+
mime="text/csv"
|
| 423 |
+
)
|
| 424 |
+
|
| 425 |
+
with col2:
|
| 426 |
+
# JSON Export
|
| 427 |
+
if st.button("📋 Download JSON", use_container_width=True):
|
| 428 |
+
json_data = json.dumps(results, indent=2, default=str)
|
| 429 |
+
st.download_button(
|
| 430 |
+
label="Click to Download JSON",
|
| 431 |
+
data=json_data,
|
| 432 |
+
file_name=f"news_analysis_{datetime.now().strftime('%Y%m%d_%H%M')}.json",
|
| 433 |
+
mime="application/json"
|
| 434 |
+
)
|
| 435 |
+
|
| 436 |
+
with col3:
|
| 437 |
+
# PDF Report
|
| 438 |
+
if st.button("📄 Generate PDF Report", use_container_width=True):
|
| 439 |
+
try:
|
| 440 |
+
pdf_buffer = generate_pdf_report(results)
|
| 441 |
+
st.download_button(
|
| 442 |
+
label="Click to Download PDF",
|
| 443 |
+
data=pdf_buffer,
|
| 444 |
+
file_name=f"news_analysis_report_{datetime.now().strftime('%Y%m%d_%H%M')}.pdf",
|
| 445 |
+
mime="application/pdf"
|
| 446 |
+
)
|
| 447 |
+
except Exception as e:
|
| 448 |
+
st.error(f"Error generating PDF: {str(e)}")
|
| 449 |
+
|
| 450 |
+
def display_api_info(results):
|
| 451 |
+
"""Display API information and examples"""
|
| 452 |
+
st.subheader("🔌 API Access")
|
| 453 |
+
|
| 454 |
+
st.write("**Endpoint:** `/api/analyze`")
|
| 455 |
+
st.write("**Method:** GET")
|
| 456 |
+
st.write("**Parameters:**")
|
| 457 |
+
st.code("""
|
| 458 |
+
- query: string (required) - Company name, ticker, or keyword
|
| 459 |
+
- num_articles: integer (default: 20) - Number of articles to analyze
|
| 460 |
+
- languages: array (default: ["English"]) - Summary languages
|
| 461 |
+
- include_audio: boolean (default: true) - Generate audio summaries
|
| 462 |
+
- sentiment_models: array (default: ["VADER", "Loughran-McDonald", "FinBERT"]) - Models to use
|
| 463 |
+
""")
|
| 464 |
+
|
| 465 |
+
st.write("**Example Request:**")
|
| 466 |
+
st.code(f"GET /api/analyze?query={results['query']}&num_articles=20")
|
| 467 |
+
|
| 468 |
+
st.write("**Sample Response:**")
|
| 469 |
+
sample_response = {
|
| 470 |
+
"query": results['query'],
|
| 471 |
+
"total_articles": len(results['articles']),
|
| 472 |
+
"average_sentiment": results['summary']['average_sentiment'],
|
| 473 |
+
"articles": results['articles'][:2] # Show first 2 articles as example
|
| 474 |
+
}
|
| 475 |
+
st.json(sample_response)
|
| 476 |
+
|
| 477 |
+
def prepare_csv_export(results):
|
| 478 |
+
"""Prepare CSV data for export"""
|
| 479 |
+
csv_data = []
|
| 480 |
+
|
| 481 |
+
for article in results['articles']:
|
| 482 |
+
row = {
|
| 483 |
+
'title': article['title'],
|
| 484 |
+
'source': article['source'],
|
| 485 |
+
'url': article.get('url', ''),
|
| 486 |
+
'date': article.get('date', ''),
|
| 487 |
+
'sentiment_compound': article['sentiment']['compound'],
|
| 488 |
+
'sentiment_label': 'Positive' if article['sentiment']['compound'] > 0.1 else 'Negative' if article['sentiment']['compound'] < -0.1 else 'Neutral',
|
| 489 |
+
'summary': article.get('summary', '')
|
| 490 |
+
}
|
| 491 |
+
|
| 492 |
+
# Add model-specific scores
|
| 493 |
+
if 'vader' in article['sentiment']:
|
| 494 |
+
row['vader_score'] = article['sentiment']['vader']
|
| 495 |
+
if 'loughran_mcdonald' in article['sentiment']:
|
| 496 |
+
row['loughran_mcdonald_score'] = article['sentiment']['loughran_mcdonald']
|
| 497 |
+
if 'finbert' in article['sentiment']:
|
| 498 |
+
row['finbert_score'] = article['sentiment']['finbert']
|
| 499 |
+
|
| 500 |
+
csv_data.append(row)
|
| 501 |
+
|
| 502 |
+
df = pd.DataFrame(csv_data)
|
| 503 |
+
return df.to_csv(index=False)
|
| 504 |
+
|
| 505 |
+
def show_demo_dashboard():
|
| 506 |
+
"""Show demo dashboard with sample data"""
|
| 507 |
+
st.header("🚀 Welcome to Global Business News Intelligence")
|
| 508 |
+
|
| 509 |
+
st.markdown("""
|
| 510 |
+
### Key Features:
|
| 511 |
+
- **🔍 Multi-Source News Scraping:** Aggregates news from reliable sources
|
| 512 |
+
- **🎯 Advanced Sentiment Analysis:** Uses VADER, Loughran-McDonald, and FinBERT models
|
| 513 |
+
- **🌐 Multilingual Support:** Summaries in English, Hindi, and Tamil
|
| 514 |
+
- **🎵 Audio Generation:** Text-to-speech for all language summaries
|
| 515 |
+
- **📊 Interactive Dashboard:** Real-time charts and visualizations
|
| 516 |
+
- **📤 Multiple Export Formats:** CSV, JSON, and PDF reports
|
| 517 |
+
- **🔌 API Access:** Programmatic access to all features
|
| 518 |
+
|
| 519 |
+
### Use Cases:
|
| 520 |
+
- **📈 Investment Research:** Track sentiment around stocks and companies
|
| 521 |
+
- **🏢 Brand Monitoring:** Monitor public perception of your brand
|
| 522 |
+
- **🔍 Market Intelligence:** Stay informed about industry trends
|
| 523 |
+
- **📰 Media Analysis:** Analyze coverage patterns across sources
|
| 524 |
+
- **🌍 Global Insights:** Access news in multiple languages
|
| 525 |
+
|
| 526 |
+
### Get Started:
|
| 527 |
+
1. Enter a company name, stock ticker, or keyword in the sidebar
|
| 528 |
+
2. Configure your analysis settings
|
| 529 |
+
3. Click "Analyze News" to start
|
| 530 |
+
4. Explore results in the interactive dashboard
|
| 531 |
+
5. Export your findings in multiple formats
|
| 532 |
+
""")
|
| 533 |
+
|
| 534 |
+
# Sample visualization
|
| 535 |
+
st.subheader("📊 Sample Analysis Dashboard")
|
| 536 |
+
|
| 537 |
+
# Create sample data
|
| 538 |
+
sample_data = {
|
| 539 |
+
'Sentiment': ['Positive', 'Negative', 'Neutral'],
|
| 540 |
+
'Count': [45, 15, 40]
|
| 541 |
+
}
|
| 542 |
+
|
| 543 |
+
fig = px.pie(
|
| 544 |
+
values=sample_data['Count'],
|
| 545 |
+
names=sample_data['Sentiment'],
|
| 546 |
+
color_discrete_map={'Positive': '#28a745', 'Negative': '#dc3545', 'Neutral': '#6c757d'},
|
| 547 |
+
title="Sample Sentiment Distribution"
|
| 548 |
+
)
|
| 549 |
+
|
| 550 |
+
col1, col2 = st.columns([1, 1])
|
| 551 |
+
with col1:
|
| 552 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 553 |
+
|
| 554 |
+
with col2:
|
| 555 |
+
st.write("**Sample Metrics:**")
|
| 556 |
+
st.metric("Articles Analyzed", "100")
|
| 557 |
+
st.metric("Average Sentiment", "0.234")
|
| 558 |
+
st.metric("Sources Covered", "15")
|
| 559 |
+
st.metric("Languages", "3")
|
| 560 |
+
|
| 561 |
+
if __name__ == "__main__":
|
| 562 |
+
main()
|
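For quick testing outside the dashboard, a minimal client sketch for the `/api/analyze` endpoint documented in `display_api_info` above. The base URL and the `Tesla` query are assumptions; the printed fields mirror the sample response the dashboard renders.

import requests  # any HTTP client works; requests is assumed here

BASE_URL = "http://localhost:8000"  # assumption: wherever the FastAPI backend is served

resp = requests.get(
    f"{BASE_URL}/api/analyze",
    params={"query": "Tesla", "num_articles": 20},  # parameters from the docs above
    timeout=120,  # first call may be slow while models load
)
resp.raise_for_status()
payload = resp.json()
print(payload["total_articles"], payload["average_sentiment"])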
summarizer_module.py
ADDED
@@ -0,0 +1,400 @@
import logging
from typing import List, Optional
import re
from transformers import pipeline, AutoTokenizer
import torch

logger = logging.getLogger(__name__)

class TextSummarizer:
    """Text summarization with chunking for long documents"""

    def __init__(self):
        self.summarizer = None
        self.tokenizer = None
        self.max_chunk_length = 1024  # Maximum tokens per chunk
        self.max_summary_length = 150
        self.min_summary_length = 50

        self._initialize_model()
        logger.info("TextSummarizer initialized")

    def _initialize_model(self):
        """Initialize the summarization model"""
        try:
            # Try different models in order of preference
            model_names = [
                "facebook/bart-large-cnn",
                "sshleifer/distilbart-cnn-12-6",
                "t5-small"
            ]

            for model_name in model_names:
                try:
                    # Use CPU to avoid memory issues on Hugging Face Spaces
                    device = -1  # CPU only for Hugging Face Spaces

                    self.summarizer = pipeline(
                        "summarization",
                        model=model_name,
                        tokenizer=model_name,
                        device=device,
                        framework="pt"
                    )

                    self.tokenizer = AutoTokenizer.from_pretrained(model_name)
                    logger.info(f"Successfully loaded summarization model: {model_name}")
                    break

                except Exception as e:
                    logger.warning(f"Failed to load {model_name}: {str(e)}")
                    continue

            if self.summarizer is None:
                logger.error("Failed to load any summarization model")

        except Exception as e:
            logger.error(f"Error initializing summarizer: {str(e)}")

    def summarize(self, text: str, max_length: int = None, min_length: int = None) -> str:
        """Summarize text with automatic chunking for long documents"""
        if not text or not text.strip():
            return ""

        if not self.summarizer:
            return self._fallback_summarize(text)

        try:
            # Use provided lengths or defaults
            max_len = max_length or self.max_summary_length
            min_len = min_length or self.min_summary_length

            # Check if text needs chunking
            if self._needs_chunking(text):
                return self._summarize_long_text(text, max_len, min_len)
            else:
                return self._summarize_chunk(text, max_len, min_len)

        except Exception as e:
            logger.error(f"Summarization failed: {str(e)}")
            return self._fallback_summarize(text)

    def _needs_chunking(self, text: str) -> bool:
        """Check if text needs to be chunked"""
        if not self.tokenizer:
            return len(text.split()) > 300  # Rough word count threshold

        try:
            tokens = self.tokenizer.encode(text, add_special_tokens=True)
            return len(tokens) > self.max_chunk_length
        except Exception:
            return len(text.split()) > 300

    def _summarize_long_text(self, text: str, max_len: int, min_len: int) -> str:
        """Summarize long text by chunking"""
        try:
            # Split text into chunks
            chunks = self._split_into_chunks(text)

            if not chunks:
                return self._fallback_summarize(text)

            # Summarize each chunk
            chunk_summaries = []
            for chunk in chunks:
                if len(chunk.strip()) > 100:  # Only summarize substantial chunks
                    summary = self._summarize_chunk(
                        chunk,
                        max_length=min(max_len // len(chunks) + 20, 100),
                        min_length=20
                    )
                    if summary and summary.strip():
                        chunk_summaries.append(summary)

            if not chunk_summaries:
                return self._fallback_summarize(text)

            # Combine chunk summaries
            combined_summary = " ".join(chunk_summaries)

            # If combined summary is still too long, summarize again
            if self._needs_chunking(combined_summary) and len(chunk_summaries) > 1:
                final_summary = self._summarize_chunk(combined_summary, max_len, min_len)
                return final_summary if final_summary else combined_summary

            return combined_summary

        except Exception as e:
            logger.error(f"Long text summarization failed: {str(e)}")
            return self._fallback_summarize(text)

    def _summarize_chunk(self, text: str, max_length: int, min_length: int) -> str:
        """Summarize a single chunk of text"""
        try:
            if not text or len(text.strip()) < 50:
                return text

            # Clean text
            cleaned_text = self._clean_text_for_summarization(text)

            if not cleaned_text:
                return text[:200] + "..." if len(text) > 200 else text

            # Generate summary
            result = self.summarizer(
                cleaned_text,
                max_length=max_length,
                min_length=min_length,
                do_sample=False,
                truncation=True
            )

            if result and len(result) > 0 and 'summary_text' in result[0]:
                summary = result[0]['summary_text'].strip()

                # Post-process summary
                summary = self._post_process_summary(summary)

                return summary if summary else cleaned_text[:200] + "..."

            return cleaned_text[:200] + "..."

        except Exception as e:
            logger.error(f"Chunk summarization failed: {str(e)}")
            return text[:200] + "..." if len(text) > 200 else text

    def _split_into_chunks(self, text: str) -> List[str]:
        """Split text into manageable chunks"""
        try:
            # Split by paragraphs first
            paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]

            if not paragraphs:
                paragraphs = [text]

            chunks = []
            current_chunk = ""
            current_length = 0

            for paragraph in paragraphs:
                paragraph_length = len(paragraph.split())

                # If adding this paragraph would exceed chunk size, start new chunk
                if current_length + paragraph_length > 250 and current_chunk:
                    chunks.append(current_chunk.strip())
                    current_chunk = paragraph
                    current_length = paragraph_length
                else:
                    if current_chunk:
                        current_chunk += "\n\n" + paragraph
                    else:
                        current_chunk = paragraph
                    current_length += paragraph_length

            # Add remaining chunk
            if current_chunk.strip():
                chunks.append(current_chunk.strip())

            # If no proper chunks, split by sentences
            if not chunks or (len(chunks) == 1 and len(chunks[0].split()) > 400):
                return self._split_by_sentences(text)

            return chunks

        except Exception as e:
            logger.error(f"Text splitting failed: {str(e)}")
            return [text]

    def _split_by_sentences(self, text: str) -> List[str]:
        """Split text by sentences as fallback"""
        try:
            sentences = re.split(r'[.!?]+\s+', text)
            chunks = []
            current_chunk = ""

            for sentence in sentences:
                if len((current_chunk + " " + sentence).split()) > 200:
                    if current_chunk:
                        chunks.append(current_chunk.strip())
                    current_chunk = sentence
                else:
                    if current_chunk:
                        current_chunk += ". " + sentence
                    else:
                        current_chunk = sentence

            if current_chunk.strip():
                chunks.append(current_chunk.strip())

            return chunks if chunks else [text]

        except Exception as e:
            logger.error(f"Sentence splitting failed: {str(e)}")
            return [text]

    def _clean_text_for_summarization(self, text: str) -> str:
        """Clean text for better summarization"""
        if not text:
            return ""

        # Remove URLs
        text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)

        # Remove email addresses
        text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '', text)

        # Remove excessive whitespace
        text = re.sub(r'\s+', ' ', text)

        # Remove common news artifacts
        artifacts = [
            r'\(Reuters\)', r'\(AP\)', r'\(Bloomberg\)', r'\(CNN\)',
            r'-- .*$', r'Photo:.*$', r'Image:.*$', r'Video:.*$',
            r'Subscribe.*$', r'Follow us.*$'
        ]

        for artifact in artifacts:
            text = re.sub(artifact, '', text, flags=re.IGNORECASE | re.MULTILINE)

        return text.strip()

    def _post_process_summary(self, summary: str) -> str:
        """Post-process generated summary"""
        if not summary:
            return ""

        # Remove incomplete sentences at the end
        sentences = re.split(r'[.!?]+', summary)
        if len(sentences) > 1 and len(sentences[-1].strip()) < 10:
            summary = '.'.join(sentences[:-1]) + '.'

        # Capitalize first letter
        summary = summary[0].upper() + summary[1:] if len(summary) > 1 else summary.upper()

        # Ensure summary ends with punctuation
        if summary and summary[-1] not in '.!?':
            summary += '.'

        return summary.strip()

    def _fallback_summarize(self, text: str) -> str:
        """Fallback summarization using simple extraction"""
        try:
            if not text or len(text.strip()) < 50:
                return text

            # Split into sentences
            sentences = re.split(r'[.!?]+', text)
            sentences = [s.strip() for s in sentences if s.strip() and len(s.split()) > 5]

            if not sentences:
                return text[:200] + "..." if len(text) > 200 else text

            # Take first few sentences (extractive summary)
            num_sentences = min(3, len(sentences))
            summary_sentences = sentences[:num_sentences]

            summary = '. '.join(summary_sentences)
            if not summary.endswith('.'):
                summary += '.'

            # If summary is too long, truncate
            if len(summary) > 300:
                words = summary.split()
                summary = ' '.join(words[:40]) + '...'

            return summary

        except Exception as e:
            logger.error(f"Fallback summarization failed: {str(e)}")
            return text[:200] + "..." if len(text) > 200 else text

    def batch_summarize(self, texts: List[str], **kwargs) -> List[str]:
        """Summarize multiple texts"""
        summaries = []

        for text in texts:
            try:
                summary = self.summarize(text, **kwargs)
                summaries.append(summary)
            except Exception as e:
                logger.error(f"Batch summarization failed for one text: {str(e)}")
                summaries.append(self._fallback_summarize(text))

        return summaries

    def get_summary_stats(self, original_text: str, summary: str) -> dict:
        """Get statistics about the summarization"""
        try:
            original_words = len(original_text.split())
            summary_words = len(summary.split())

            compression_ratio = summary_words / original_words if original_words > 0 else 0

            return {
                'original_length': original_words,
                'summary_length': summary_words,
                'compression_ratio': compression_ratio,
                'compression_percentage': (1 - compression_ratio) * 100
            }

        except Exception as e:
            logger.error(f"Error calculating summary stats: {str(e)}")
            return {
                'original_length': 0,
                'summary_length': 0,
                'compression_ratio': 0,
                'compression_percentage': 0
            }

# Utility functions
def extract_key_sentences(text: str, num_sentences: int = 3) -> List[str]:
    """Extract key sentences using simple heuristics"""
    try:
        sentences = re.split(r'[.!?]+', text)
        sentences = [s.strip() for s in sentences if s.strip() and len(s.split()) > 5]

        if not sentences:
            return []

        # Score sentences based on position and keyword density
        scored_sentences = []

        for i, sentence in enumerate(sentences):
            score = 0

            # Position bonus (earlier sentences get higher scores)
            if i < len(sentences) * 0.3:
                score += 3
            elif i < len(sentences) * 0.6:
                score += 2
            else:
                score += 1

            # Length bonus (medium-length sentences preferred)
            words = len(sentence.split())
            if 10 <= words <= 25:
                score += 2
            elif 5 <= words <= 35:
                score += 1

            # Keyword bonus (sentences with common business/finance terms)
            keywords = [
                'company', 'business', 'revenue', 'profit', 'growth', 'market',
                'financial', 'earnings', 'investment', 'stock', 'shares', 'economy'
            ]

            sentence_lower = sentence.lower()
            keyword_count = sum(1 for keyword in keywords if keyword in sentence_lower)
            score += keyword_count

            scored_sentences.append((sentence, score))

        # Sort by score and return top sentences
        scored_sentences.sort(key=lambda x: x[1], reverse=True)

        return [sent[0] for sent in scored_sentences[:num_sentences]]

    except Exception as e:
        logger.error(f"Key sentence extraction failed: {str(e)}")
        return []
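A minimal usage sketch for the class above, assuming the file is saved as `summarizer.py` (the name `api_backend` imports); the sample article text and length arguments are illustrative.

from summarizer import TextSummarizer, extract_key_sentences

ts = TextSummarizer()

article = (
    "The company reported quarterly revenue of 4.2 billion dollars, beating "
    "analyst estimates. Management raised full-year guidance on strong demand. "
    "Shares rose in after-hours trading as investors digested the results."
)

summary = ts.summarize(article, max_length=60, min_length=20)
stats = ts.get_summary_stats(article, summary)

print(summary)
print(f"Compression: {stats['compression_percentage']:.0f}%")
print(extract_key_sentences(article, num_sentences=2))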
translator_module (1).py
ADDED
@@ -0,0 +1,336 @@
import logging
from typing import Any, Dict, List, Optional
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import re

logger = logging.getLogger(__name__)

class MultilingualTranslator:
    """Multilingual translation with support for Hindi and Tamil"""

    def __init__(self):
        self.translators = {}
        self.language_codes = {
            'Hindi': 'hi',
            'Tamil': 'ta',
            'English': 'en'
        }

        # Supported translation pairs
        self.supported_pairs = {
            'en-hi': 'Helsinki-NLP/opus-mt-en-hi',
            'en-ta': 'Helsinki-NLP/opus-mt-en-mul',  # Multilingual model for Tamil
            'hi-en': 'Helsinki-NLP/opus-mt-hi-en',
            'ta-en': 'Helsinki-NLP/opus-mt-mul-en'
        }

        self._initialize_models()
        logger.info("MultilingualTranslator initialized")

    def _initialize_models(self):
        """Initialize translation models on-demand"""
        # Don't load all models at startup to save memory;
        # they will be loaded when first needed.
        logger.info("Translation models will be loaded on-demand")

    def _load_translator(self, source_lang: str, target_lang: str) -> Optional[object]:
        """Load a specific translator model"""
        pair_key = f"{source_lang}-{target_lang}"

        if pair_key in self.translators:
            return self.translators[pair_key]

        try:
            model_name = self.supported_pairs.get(pair_key)
            if not model_name:
                logger.error(f"No model available for {source_lang} -> {target_lang}")
                return None

            # Use CPU for Hugging Face Spaces compatibility
            device = -1  # CPU only

            translator = pipeline(
                "translation",
                model=model_name,
                device=device,
                framework="pt"
            )

            self.translators[pair_key] = translator
            logger.info(f"Loaded translator for {source_lang} -> {target_lang}")

            return translator

        except Exception as e:
            logger.error(f"Failed to load translator {pair_key}: {str(e)}")
            return None

    def translate(self, text: str, target_lang: str, source_lang: str = 'English') -> str:
        """Translate text to target language"""
        if not text or not text.strip():
            return ""

        # Get language codes
        source_code = self.language_codes.get(source_lang, 'en')
        target_code = self.language_codes.get(target_lang, target_lang.lower()[:2])

        # If source and target are the same, return original text
        if source_code == target_code:
            return text

        try:
            # Load the appropriate translator
            translator = self._load_translator(source_code, target_code)

            if not translator:
                return self._fallback_translate(text, target_lang)

            # Clean and prepare text
            cleaned_text = self._prepare_text_for_translation(text)

            if not cleaned_text:
                return text

            # Split long text into chunks for translation
            if len(cleaned_text.split()) > 200:
                return self._translate_long_text(cleaned_text, translator)
            else:
                return self._translate_chunk(cleaned_text, translator)

        except Exception as e:
            logger.error(f"Translation failed: {str(e)}")
            return self._fallback_translate(text, target_lang)

    def _translate_chunk(self, text: str, translator) -> str:
        """Translate a single chunk of text"""
        try:
            result = translator(text, max_length=512)

            if result and len(result) > 0:
                translated = result[0].get('translation_text', text)
                return self._post_process_translation(translated)

            return text

        except Exception as e:
            logger.error(f"Chunk translation failed: {str(e)}")
            return text

    def _translate_long_text(self, text: str, translator) -> str:
        """Translate long text by splitting into chunks"""
        try:
            # Split by sentences
            sentences = self._split_into_sentences(text)

            if not sentences:
                return text

            translated_sentences = []
            current_chunk = ""

            for sentence in sentences:
                # If adding this sentence would make the chunk too long, translate current chunk
                if len((current_chunk + " " + sentence).split()) > 150 and current_chunk:
                    translated = self._translate_chunk(current_chunk, translator)
                    translated_sentences.append(translated)
                    current_chunk = sentence
                else:
                    if current_chunk:
                        current_chunk += " " + sentence
                    else:
                        current_chunk = sentence

            # Translate remaining chunk
            if current_chunk:
                translated = self._translate_chunk(current_chunk, translator)
                translated_sentences.append(translated)

            return " ".join(translated_sentences)

        except Exception as e:
            logger.error(f"Long text translation failed: {str(e)}")
            return text

    def _split_into_sentences(self, text: str) -> List[str]:
        """Split text into sentences"""
        try:
            # Simple sentence splitting
            sentences = re.split(r'[.!?]+\s+', text)
            sentences = [s.strip() for s in sentences if s.strip()]

            return sentences

        except Exception as e:
            logger.error(f"Sentence splitting failed: {str(e)}")
            return [text]

    def _prepare_text_for_translation(self, text: str) -> str:
        """Prepare text for translation"""
        if not text:
            return ""

        # Remove URLs
        text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)

        # Remove email addresses
        text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '', text)

        # Clean excessive whitespace
        text = re.sub(r'\s+', ' ', text)

        # Remove special characters that might cause issues
        text = re.sub(r'[^\w\s.,!?;:\-\'"()/%$]', '', text)

        return text.strip()

    def _post_process_translation(self, text: str) -> str:
        """Post-process translated text"""
        if not text:
            return ""

        # Clean up extra spaces
        text = re.sub(r'\s+', ' ', text)

        # Capitalize first letter if it's a sentence
        if text and len(text) > 1:
            text = text[0].upper() + text[1:]

        return text.strip()

    def _fallback_translate(self, text: str, target_lang: str) -> str:
        """Fallback translation with basic text processing"""
        logger.warning(f"Using fallback translation for {target_lang}")

        # For demonstration purposes, return the original text with a note.
        # In a production system, you might use a different translation service.
        if target_lang.lower() in ['hindi', 'hi']:
            return f"[Hindi] {text}"
        elif target_lang.lower() in ['tamil', 'ta']:
            return f"[Tamil] {text}"
        else:
            return text

    def batch_translate(self, texts: List[str], target_lang: str, source_lang: str = 'English') -> List[str]:
        """Translate multiple texts"""
        translations = []

        for text in texts:
            try:
                translation = self.translate(text, target_lang, source_lang)
                translations.append(translation)
            except Exception as e:
                logger.error(f"Batch translation failed for one text: {str(e)}")
                translations.append(self._fallback_translate(text, target_lang))

        return translations

    def detect_language(self, text: str) -> str:
        """Simple language detection (basic implementation)"""
        try:
            # Basic detection using character patterns
            if not text:
                return 'en'

            # Check for Devanagari script (Hindi)
            if re.search(r'[\u0900-\u097F]', text):
                return 'hi'

            # Check for Tamil script
            if re.search(r'[\u0B80-\u0BFF]', text):
                return 'ta'

            # Default to English
            return 'en'

        except Exception as e:
            logger.error(f"Language detection failed: {str(e)}")
            return 'en'

    def get_supported_languages(self) -> List[str]:
        """Get list of supported languages"""
        return list(self.language_codes.keys())

    def is_translation_available(self, source_lang: str, target_lang: str) -> bool:
        """Check if translation is available between two languages"""
        source_code = self.language_codes.get(source_lang, source_lang.lower()[:2])
        target_code = self.language_codes.get(target_lang, target_lang.lower()[:2])

        pair_key = f"{source_code}-{target_code}"
        return pair_key in self.supported_pairs

    def translate_with_confidence(self, text: str, target_lang: str, source_lang: str = 'English') -> Dict[str, Any]:
        """Translate text and return result with confidence metrics"""
        try:
            translated_text = self.translate(text, target_lang, source_lang)

            # Simple confidence calculation based on text characteristics
            confidence = self._calculate_translation_confidence(text, translated_text, target_lang)

            return {
                'original_text': text,
                'translated_text': translated_text,
                'source_language': source_lang,
                'target_language': target_lang,
                'confidence': confidence,
                'method': 'neural_translation' if translated_text != text else 'fallback'
            }

        except Exception as e:
            logger.error(f"Translation with confidence failed: {str(e)}")
            return {
                'original_text': text,
                'translated_text': text,
                'source_language': source_lang,
                'target_language': target_lang,
                'confidence': 0.0,
                'method': 'error',
                'error': str(e)
            }

    def _calculate_translation_confidence(self, original: str, translated: str, target_lang: str) -> float:
        """Calculate a simple confidence score for translation"""
        try:
            # If translation failed (same as original), low confidence
            if original == translated and target_lang != 'English':
                return 0.2

            # If text is very short, moderate confidence
            if len(original.split()) < 5:
                return 0.7

            # If translation is significantly different in length, lower confidence
            original_len = len(original.split())
            translated_len = len(translated.split())

            length_ratio = min(original_len, translated_len) / max(original_len, translated_len)

            if length_ratio < 0.5:
                return 0.6
            elif length_ratio < 0.7:
                return 0.8
            else:
                return 0.9

        except Exception as e:
            logger.error(f"Confidence calculation failed: {str(e)}")
            return 0.5

# Utility functions
def get_language_name(code: str) -> str:
    """Get full language name from code"""
    code_to_name = {
        'en': 'English',
        'hi': 'Hindi',
        'ta': 'Tamil'
    }
    return code_to_name.get(code.lower(), code)

def get_language_code(name: str) -> str:
    """Get language code from name"""
    name_to_code = {
        'english': 'en',
        'hindi': 'hi',
        'tamil': 'ta'
    }
    return name_to_code.get(name.lower(), name.lower()[:2])
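A usage sketch for the translator, again assuming the module is importable as `translator` (as `api_backend` does); unsupported pairs fall back to the tagged passthrough in `_fallback_translate`, and the Tamil sample string is only there to exercise script detection.

from translator import MultilingualTranslator, get_language_code

mt = MultilingualTranslator()

text = "Quarterly revenue grew faster than analysts expected."

if mt.is_translation_available('English', 'Hindi'):
    result = mt.translate_with_confidence(text, target_lang='Hindi')
    print(result['translated_text'])
    print(result['method'], result['confidence'])

print(mt.detect_language("நிறுவனத்தின் வருவாய் உயர்ந்தது"))  # Tamil script -> 'ta'
print(get_language_code('Hindi'))  # -> 'hi'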
tts_module.py
ADDED
@@ -0,0 +1,336 @@
| 1 |
+
import logging
|
| 2 |
+
import os
|
| 3 |
+
import tempfile
|
| 4 |
+
from typing import Dict, List, Optional
|
| 5 |
+
import hashlib
|
| 6 |
+
from datetime import datetime
|
| 7 |
+
|
| 8 |
+
# gTTS for text-to-speech
|
| 9 |
+
try:
|
| 10 |
+
from gtts import gTTS
|
| 11 |
+
GTTS_AVAILABLE = True
|
| 12 |
+
except ImportError:
|
| 13 |
+
GTTS_AVAILABLE = False
|
| 14 |
+
|
| 15 |
+
logger = logging.getLogger(__name__)
|
| 16 |
+
|
| 17 |
+
class AudioGenerator:
|
| 18 |
+
"""Text-to-speech audio generation with multilingual support"""
|
| 19 |
+
|
| 20 |
+
def __init__(self):
|
| 21 |
+
self.supported_languages = {
|
| 22 |
+
'English': 'en',
|
| 23 |
+
'Hindi': 'hi',
|
| 24 |
+
'Tamil': 'ta'
|
| 25 |
+
}
|
| 26 |
+
|
| 27 |
+
# Audio cache directory
|
| 28 |
+
self.cache_dir = tempfile.mkdtemp(prefix='news_audio_')
|
| 29 |
+
self.audio_cache = {}
|
| 30 |
+
|
| 31 |
+
logger.info(f"AudioGenerator initialized with cache directory: {self.cache_dir}")
|
| 32 |
+
|
| 33 |
+
if not GTTS_AVAILABLE:
|
| 34 |
+
logger.warning("gTTS not available. Audio generation will be limited.")
|
| 35 |
+
|
| 36 |
+
def generate_audio(self, text: str, language: str = 'English', output_file: str = None) -> Optional[str]:
|
| 37 |
+
"""Generate audio from text"""
|
| 38 |
+
if not text or not text.strip():
|
| 39 |
+
logger.warning("Empty text provided for audio generation")
|
| 40 |
+
return None
|
| 41 |
+
|
| 42 |
+
if not GTTS_AVAILABLE:
|
| 43 |
+
logger.error("gTTS not available for audio generation")
|
| 44 |
+
return None
|
| 45 |
+
|
| 46 |
+
try:
|
| 47 |
+
# Get language code
|
| 48 |
+
lang_code = self.supported_languages.get(language, 'en')
|
| 49 |
+
|
| 50 |
+
# Create cache key
|
| 51 |
+
cache_key = self._create_cache_key(text, language)
|
| 52 |
+
|
| 53 |
+
# Check cache first
|
| 54 |
+
if cache_key in self.audio_cache:
|
| 55 |
+
cached_file = self.audio_cache[cache_key]
|
| 56 |
+
if os.path.exists(cached_file):
|
| 57 |
+
logger.info(f"Using cached audio for {language}")
|
| 58 |
+
return cached_file
|
| 59 |
+
|
| 60 |
+
# Generate output filename if not provided
|
| 61 |
+
if not output_file:
|
| 62 |
+
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 63 |
+
output_file = os.path.join(self.cache_dir, f"audio_{lang_code}_{timestamp}.mp3")
|
| 64 |
+
elif not os.path.dirname(output_file):
|
| 65 |
+
output_file = os.path.join(self.cache_dir, output_file)
|
| 66 |
+
|
| 67 |
+
# Prepare text for TTS
|
| 68 |
+
clean_text = self._prepare_text_for_tts(text)
|
| 69 |
+
|
| 70 |
+
if not clean_text:
|
| 71 |
+
logger.warning("No valid text for TTS after cleaning")
|
| 72 |
+
return None
|
| 73 |
+
|
| 74 |
+
# Generate audio using gTTS
|
| 75 |
+
if lang_code in ['en', 'hi']:
|
| 76 |
+
# gTTS supports English and Hindi directly
|
| 77 |
+
tts = gTTS(text=clean_text, lang=lang_code, slow=False)
|
| 78 |
+
elif lang_code == 'ta':
|
| 79 |
+
# For Tamil, use English as fallback or try Tamil if available
|
| 80 |
+
try:
|
| 81 |
+
tts = gTTS(text=clean_text, lang='ta', slow=False)
|
| 82 |
+
except:
|
| 83 |
+
logger.warning("Tamil not supported in gTTS, using English")
|
| 84 |
+
tts = gTTS(text=clean_text, lang='en', slow=False)
|
| 85 |
+
else:
|
| 86 |
+
# Default to English
|
| 87 |
+
tts = gTTS(text=clean_text, lang='en', slow=False)
|
| 88 |
+
|
| 89 |
+
# Save audio file
|
| 90 |
+
tts.save(output_file)
|
| 91 |
+
|
| 92 |
+
# Verify file was created
|
| 93 |
+
if os.path.exists(output_file) and os.path.getsize(output_file) > 0:
|
| 94 |
+
# Cache the result
|
| 95 |
+
self.audio_cache[cache_key] = output_file
|
| 96 |
+
|
| 97 |
+
logger.info(f"Audio generated successfully: {output_file}")
|
| 98 |
+
return output_file
|
| 99 |
+
else:
|
| 100 |
+
logger.error("Audio file was not created or is empty")
|
| 101 |
+
return None
|
| 102 |
+
|
| 103 |
+
except Exception as e:
|
| 104 |
+
logger.error(f"Audio generation failed: {str(e)}")
|
| 105 |
+
return None
|
| 106 |
+
|
| 107 |
+
def _create_cache_key(self, text: str, language: str) -> str:
|
| 108 |
+
"""Create a cache key for the text and language combination"""
|
| 109 |
+
try:
|
| 110 |
+
combined = f"{text[:500]}_{language}" # Use first 500 chars to avoid very long keys
|
| 111 |
+
return hashlib.md5(combined.encode()).hexdigest()
|
| 112 |
+
except Exception as e:
|
| 113 |
+
logger.error(f"Cache key creation failed: {str(e)}")
|
| 114 |
+
return f"default_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
|
| 115 |
+
|
| 116 |
+
def _prepare_text_for_tts(self, text: str) -> str:
|
| 117 |
+
"""Prepare text for text-to-speech conversion"""
|
| 118 |
+
if not text:
|
| 119 |
+
return ""
|
| 120 |
+
|
| 121 |
+
# Remove or replace problematic characters
|
| 122 |
+
import re
|
| 123 |
+
|
| 124 |
+
# Remove URLs
|
| 125 |
+
text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
|
| 126 |
+
|
| 127 |
+
# Remove email addresses
|
| 128 |
+
text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '', text)
|
| 129 |
+
|
| 130 |
+
# Replace multiple spaces with single space
|
| 131 |
+
text = re.sub(r'\s+', ' ', text)
|
| 132 |
+
|
| 133 |
+
# Remove excessive punctuation
|
| 134 |
+
text = re.sub(r'[.]{3,}', '...', text)
|
| 135 |
+
text = re.sub(r'[!]{2,}', '!', text)
|
| 136 |
+
text = re.sub(r'[?]{2,}', '?', text)
|
| 137 |
+
|
| 138 |
+
# Remove parenthetical citations and references
|
| 139 |
+
text = re.sub(r'\([^)]*\)', '', text)
|
| 140 |
+
text = re.sub(r'\[[^\]]*\]', '', text)
|
| 141 |
+
|
| 142 |
+
# Limit text length for TTS (gTTS has limits)
|
| 143 |
+
max_length = 5000 # Characters
|
| 144 |
+
if len(text) > max_length:
|
| 145 |
+
# Try to cut at sentence boundary
|
| 146 |
+
sentences = re.split(r'[.!?]+', text[:max_length])
|
| 147 |
+
if len(sentences) > 1:
|
| 148 |
+
text = '. '.join(sentences[:-1]) + '.'
|
| 149 |
+
else:
|
| 150 |
+
text = text[:max_length] + '...'
|
| 151 |
+
|
| 152 |
+
return text.strip()
|
| 153 |
+
|
| 154 |
+
def generate_batch_audio(self, texts: Dict[str, str], language: str = 'English') -> Dict[str, str]:
|
| 155 |
+
"""Generate audio for multiple texts"""
|
| 156 |
+
results = {}
|
| 157 |
+
|
| 158 |
+
for key, text in texts.items():
|
| 159 |
+
try:
|
| 160 |
+
output_file = f"audio_{key}_{language.lower()}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp3"
|
| 161 |
+
audio_file = self.generate_audio(text, language, output_file)
|
| 162 |
+
results[key] = audio_file
|
| 163 |
+
except Exception as e:
|
| 164 |
+
logger.error(f"Batch audio generation failed for {key}: {str(e)}")
|
| 165 |
+
results[key] = None
|
| 166 |
+
|
| 167 |
+
return results
|
| 168 |
+
|
| 169 |
+
def generate_summary_audio(self, articles: List[Dict], languages: List[str] = None) -> Dict[str, str]:
|
| 170 |
+
"""Generate audio summaries for articles in multiple languages"""
|
| 171 |
+
if languages is None:
|
| 172 |
+
languages = ['English']
|
| 173 |
+
|
| 174 |
+
audio_files = {}
|
| 175 |
+
|
| 176 |
+
try:
|
| 177 |
+
# Create overall summary text
|
| 178 |
+
summary_text = self._create_audio_summary(articles)
|
| 179 |
+
|
| 180 |
+
if not summary_text:
|
| 181 |
+
logger.warning("No summary text created for audio")
|
| 182 |
+
return audio_files
|
| 183 |
+
|
| 184 |
+
# Generate audio for each language
|
| 185 |
+
for language in languages:
|
| 186 |
+
if language in self.supported_languages:
|
| 187 |
+
try:
|
| 188 |
+
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 189 |
+
output_file = f"summary_{language.lower()}_{timestamp}.mp3"
|
| 190 |
+
|
| 191 |
+
audio_file = self.generate_audio(summary_text, language, output_file)
|
| 192 |
+
|
| 193 |
+
if audio_file:
|
| 194 |
+
audio_files[language] = audio_file
|
| 195 |
+
else:
|
| 196 |
+
logger.warning(f"Failed to generate audio for {language}")
|
| 197 |
+
|
| 198 |
+
except Exception as e:
|
| 199 |
+
logger.error(f"Audio generation failed for {language}: {str(e)}")
|
| 200 |
+
continue
|
| 201 |
+
else:
|
| 202 |
+
logger.warning(f"Language {language} not supported for audio")
|
| 203 |
+
|
| 204 |
+
return audio_files
|
| 205 |
+
|
| 206 |
+
except Exception as e:
|
| 207 |
+
logger.error(f"Summary audio generation failed: {str(e)}")
|
| 208 |
+
return audio_files
|
| 209 |
+
|
| 210 |
+
def _create_audio_summary(self, articles: List[Dict]) -> str:
|
| 211 |
+
"""Create a comprehensive audio summary from articles"""
|
| 212 |
+
try:
|
| 213 |
+
if not articles:
|
| 214 |
+
return ""
|
| 215 |
+
|
| 216 |
+
# Calculate sentiment distribution
|
| 217 |
+
positive_count = sum(1 for article in articles if article.get('sentiment', {}).get('compound', 0) > 0.1)
|
| 218 |
+
negative_count = sum(1 for article in articles if article.get('sentiment', {}).get('compound', 0) < -0.1)
|
| 219 |
+
neutral_count = len(articles) - positive_count - negative_count
|
| 220 |
+
|
| 221 |
+
# Start building summary
|
| 222 |
+
summary_parts = []
|
| 223 |
+
|
| 224 |
+
# Opening
|
| 225 |
+
summary_parts.append(f"News analysis summary for {len(articles)} articles.")
|
| 226 |
+
|
| 227 |
+
# Sentiment overview
|
| 228 |
+
if positive_count > negative_count:
|
| 229 |
+
summary_parts.append(f"Overall sentiment is predominantly positive, with {positive_count} positive articles, {negative_count} negative, and {neutral_count} neutral.")
|
| 230 |
+
elif negative_count > positive_count:
|
| 231 |
+
summary_parts.append(f"Overall sentiment is predominantly negative, with {negative_count} negative articles, {positive_count} positive, and {neutral_count} neutral.")
|
| 232 |
+
else:
|
                summary_parts.append(f"Sentiment is mixed with balanced coverage across {positive_count} positive, {negative_count} negative, and {neutral_count} neutral articles.")

            # Top stories
            # Most positive story
            positive_articles = sorted(articles, key=lambda x: x.get('sentiment', {}).get('compound', 0), reverse=True)
            if positive_articles and positive_articles[0].get('sentiment', {}).get('compound', 0) > 0.1:
                top_positive = positive_articles[0]
                summary_parts.append(f"Most positive coverage: {top_positive.get('title', '')[:100]}")

            # Most negative story
            negative_articles = sorted(articles, key=lambda x: x.get('sentiment', {}).get('compound', 0))
            if negative_articles and negative_articles[0].get('sentiment', {}).get('compound', 0) < -0.1:
                top_negative = negative_articles[0]
                summary_parts.append(f"Most concerning coverage: {top_negative.get('title', '')[:100]}")

            # Recent developments (if we have dates)
            recent_articles = [a for a in articles if a.get('date')]
            if recent_articles:
                recent_articles.sort(key=lambda x: x.get('date', ''), reverse=True)
                summary_parts.append(f"Latest development: {recent_articles[0].get('title', '')[:100]}")

            # Closing
            summary_parts.append("This concludes the news analysis summary.")

            # Join all parts
            full_summary = " ".join(summary_parts)

            # Keep the narration to a reasonable length for TTS
            if len(full_summary) > 1000:
                # Truncate to the first few sentences
                sentences = full_summary.split('. ')
                return '. '.join(sentences[:8]) + '.'

            return full_summary

        except Exception as e:
            logger.error(f"Audio summary creation failed: {str(e)}")
            return f"Analysis complete for {len(articles)} articles with mixed sentiment coverage."

    def cleanup_cache(self, max_age_hours: int = 24):
        """Clean up old audio files from cache"""
        try:
            if not os.path.exists(self.cache_dir):
                return

            current_time = datetime.now().timestamp()
            max_age_seconds = max_age_hours * 3600

            removed_count = 0

            for filename in os.listdir(self.cache_dir):
                filepath = os.path.join(self.cache_dir, filename)

                if os.path.isfile(filepath):
                    file_age = current_time - os.path.getmtime(filepath)

                    if file_age > max_age_seconds:
                        try:
                            os.remove(filepath)
                            removed_count += 1

                            # Drop matching entries from the in-memory cache as well
                            cache_keys_to_remove = [k for k, v in self.audio_cache.items() if v == filepath]
                            for key in cache_keys_to_remove:
                                del self.audio_cache[key]

                        except Exception as e:
                            logger.error(f"Failed to remove old audio file {filepath}: {str(e)}")

            if removed_count > 0:
                logger.info(f"Cleaned up {removed_count} old audio files")

        except Exception as e:
            logger.error(f"Cache cleanup failed: {str(e)}")

    def get_cache_info(self) -> Dict[str, Any]:
        """Get information about the audio cache"""
        try:
            cache_info = {
                'cache_directory': self.cache_dir,
                'cached_files': len(self.audio_cache),
                'supported_languages': list(self.supported_languages.keys()),
                'gtts_available': GTTS_AVAILABLE
            }

            if os.path.exists(self.cache_dir):
                files = [f for f in os.listdir(self.cache_dir) if f.endswith('.mp3')]
                cache_info['physical_files'] = len(files)

                total_size = sum(os.path.getsize(os.path.join(self.cache_dir, f)) for f in files)
                cache_info['total_size_bytes'] = total_size
                cache_info['total_size_mb'] = round(total_size / (1024 * 1024), 2)

            return cache_info

        except Exception as e:
            logger.error(f"Cache info retrieval failed: {str(e)}")
            return {'error': str(e)}

    def is_language_supported(self, language: str) -> bool:
        """Check if a language is supported for audio generation"""
        return language in self.supported_languages and GTTS_AVAILABLE
utils_module (1).py
ADDED
@@ -0,0 +1,442 @@
import logging
import os
import json
import pickle
import hashlib
from datetime import datetime, timedelta
from typing import Dict, Any, Optional, List
import tempfile
import sys

def setup_logging():
    """Setup logging configuration"""
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.StreamHandler(sys.stdout),
            logging.FileHandler('news_analyzer.log')
        ]
    )

    # Reduce noise from transformers and other libraries
    logging.getLogger("transformers").setLevel(logging.WARNING)
    logging.getLogger("urllib3").setLevel(logging.WARNING)
    logging.getLogger("requests").setLevel(logging.WARNING)

def load_config() -> Dict[str, Any]:
    """Load application configuration"""
    default_config = {
        'max_articles': 50,
        'cache_ttl_hours': 6,
        'supported_languages': ['English', 'Hindi', 'Tamil'],
        'sentiment_models': ['VADER', 'Loughran-McDonald', 'FinBERT'],
        'summarization_max_length': 150,
        'summarization_min_length': 50,
        'audio_enabled': True,
        'translation_enabled': True,
        'keyword_extraction_enabled': True,
        'max_keywords': 20,
        'debug_mode': False
    }

    # Overlay values from config.json if it exists
    config_file = 'config.json'
    if os.path.exists(config_file):
        try:
            with open(config_file, 'r') as f:
                file_config = json.load(f)
            default_config.update(file_config)
        except Exception as e:
            logging.error(f"Failed to load config file: {str(e)}")

    return default_config
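Because load_config() overlays config.json onto the defaults, a partial override file is enough. A hypothetical example:

# config.json contains only: {"max_articles": 25, "debug_mode": true}
config = load_config()
config['max_articles']      # 25, taken from the file
config['cache_ttl_hours']   # 6, default preserved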
class CacheManager:
    """Simple file-based caching system"""

    def __init__(self, cache_dir: str = None):
        self.cache_dir = cache_dir or tempfile.mkdtemp(prefix='news_cache_')
        self.ensure_cache_dir()

        logging.info(f"Cache manager initialized with directory: {self.cache_dir}")

    def ensure_cache_dir(self):
        """Ensure cache directory exists"""
        try:
            os.makedirs(self.cache_dir, exist_ok=True)
        except Exception as e:
            logging.error(f"Failed to create cache directory: {str(e)}")

    def _get_cache_key(self, key: str) -> str:
        """Generate a safe cache key"""
        return hashlib.md5(key.encode()).hexdigest()

    def get(self, key: str, ttl_hours: int = 6) -> Optional[Any]:
        """Get item from cache"""
        try:
            cache_key = self._get_cache_key(key)
            cache_file = os.path.join(self.cache_dir, f"{cache_key}.pkl")

            if not os.path.exists(cache_file):
                return None

            # Treat entries older than the TTL as expired
            file_age = datetime.now().timestamp() - os.path.getmtime(cache_file)
            if file_age > ttl_hours * 3600:
                try:
                    os.remove(cache_file)
                except OSError:
                    pass
                return None

            # Load cached data
            with open(cache_file, 'rb') as f:
                data = pickle.load(f)

            logging.debug(f"Cache hit for key: {key[:50]}...")
            return data

        except Exception as e:
            logging.error(f"Cache get failed for key {key}: {str(e)}")
            return None

    def set(self, key: str, value: Any) -> bool:
        """Set item in cache"""
        try:
            cache_key = self._get_cache_key(key)
            cache_file = os.path.join(self.cache_dir, f"{cache_key}.pkl")

            with open(cache_file, 'wb') as f:
                pickle.dump(value, f)

            logging.debug(f"Cache set for key: {key[:50]}...")
            return True

        except Exception as e:
            logging.error(f"Cache set failed for key {key}: {str(e)}")
            return False

    def clear_expired(self, ttl_hours: int = 24):
        """Clear expired cache entries"""
        try:
            current_time = datetime.now().timestamp()
            max_age = ttl_hours * 3600
            cleared_count = 0

            for filename in os.listdir(self.cache_dir):
                if filename.endswith('.pkl'):
                    filepath = os.path.join(self.cache_dir, filename)
                    file_age = current_time - os.path.getmtime(filepath)

                    if file_age > max_age:
                        try:
                            os.remove(filepath)
                            cleared_count += 1
                        except Exception as e:
                            logging.error(f"Failed to remove cache file {filepath}: {str(e)}")

            if cleared_count > 0:
                logging.info(f"Cleared {cleared_count} expired cache entries")

        except Exception as e:
            logging.error(f"Cache cleanup failed: {str(e)}")
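Direct use of the cache is straightforward; a short sketch (keys are arbitrary strings, hashed internally into MD5 filenames):

cm = CacheManager()
cm.set('articles:tesla', [{'title': 'Example article'}])
hit = cm.get('articles:tesla', ttl_hours=6)   # returns the list, or None once expired
cm.clear_expired(ttl_hours=24)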
+
# Global cache instance
|
| 146 |
+
cache_manager = CacheManager()
|
| 147 |
+
|
| 148 |
+
def cache_results(func):
|
| 149 |
+
"""Decorator for caching function results"""
|
| 150 |
+
def wrapper(*args, **kwargs):
|
| 151 |
+
# Create cache key from function name and arguments
|
| 152 |
+
cache_key = f"{func.__name__}_{str(args)}_{str(kwargs)}"
|
| 153 |
+
|
| 154 |
+
# Try to get from cache
|
| 155 |
+
cached_result = cache_manager.get(cache_key)
|
| 156 |
+
if cached_result is not None:
|
| 157 |
+
return cached_result
|
| 158 |
+
|
| 159 |
+
# Execute function and cache result
|
| 160 |
+
result = func(*args, **kwargs)
|
| 161 |
+
cache_manager.set(cache_key, result)
|
| 162 |
+
|
| 163 |
+
return result
|
| 164 |
+
|
| 165 |
+
return wrapper
|
| 166 |
+
|
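A hypothetical decorated function. Note the cache key is built from str(args), so arguments should have stable string representations:

@cache_results
def fetch_headlines(company: str) -> list:
    # an expensive scrape or API call would go here (illustrative)
    return [f"{company} quarterly results beat estimates"]

fetch_headlines('Tesla')   # computed, then written to the file cache
fetch_headlines('Tesla')   # served from the file cache on the second call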
def validate_input(text: str, min_length: int = 10, max_length: int = 10000) -> bool:
    """Validate input text"""
    if not text or not isinstance(text, str):
        return False

    text = text.strip()
    if len(text) < min_length or len(text) > max_length:
        return False

    return True

def sanitize_filename(filename: str) -> str:
    """Sanitize filename for safe file system usage"""
    import re

    # Replace invalid characters
    sanitized = re.sub(r'[<>:"/\\|?*]', '_', filename)

    # Collapse whitespace and repeated dots
    sanitized = re.sub(r'\s+', '_', sanitized)
    sanitized = re.sub(r'\.+', '.', sanitized)

    # Limit length
    if len(sanitized) > 200:
        sanitized = sanitized[:200]

    return sanitized

def format_datetime(dt: datetime = None) -> str:
    """Format datetime for display"""
    if dt is None:
        dt = datetime.now()

    return dt.strftime("%Y-%m-%d %H:%M:%S")

def calculate_processing_stats(start_time: datetime, num_articles: int) -> Dict[str, Any]:
    """Calculate processing statistics"""
    end_time = datetime.now()
    processing_time = (end_time - start_time).total_seconds()

    return {
        'start_time': format_datetime(start_time),
        'end_time': format_datetime(end_time),
        'processing_time_seconds': processing_time,
        'processing_time_formatted': f"{processing_time:.2f} seconds",
        'articles_processed': num_articles,
        'articles_per_second': round(num_articles / processing_time, 2) if processing_time > 0 else 0
    }
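Timing a batch with calculate_processing_stats (the article count is illustrative):

start = datetime.now()
# ... analyze 40 articles ...
stats = calculate_processing_stats(start, num_articles=40)
print(stats['processing_time_formatted'], stats['articles_per_second'], 'articles/s')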
def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 100) -> List[str]:
    """Split text into overlapping chunks"""
    if len(text) <= chunk_size:
        return [text]

    chunks = []
    start = 0

    while start < len(text):
        end = start + chunk_size

        # If this isn't the last chunk, try to break at a sentence boundary
        if end < len(text):
            # Look for sentence boundaries in the last 100 characters
            last_part = text[end-100:end]
            sentence_end = max(
                last_part.rfind('.'),
                last_part.rfind('!'),
                last_part.rfind('?')
            )

            if sentence_end != -1:
                end = end - 100 + sentence_end + 1

        chunks.append(text[start:end].strip())
        start = end - overlap

    return [chunk for chunk in chunks if chunk.strip()]
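A quick illustration: with the defaults, each chunk ends at the latest sentence boundary found in its final 100 characters, and consecutive chunks share a 100-character overlap:

text = "Markets rallied today. " * 120   # ~2,760 characters
chunks = chunk_text(text, chunk_size=1000, overlap=100)
print(len(chunks), [len(c) for c in chunks])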
+
def extract_domain(url: str) -> str:
|
| 246 |
+
"""Extract domain from URL"""
|
| 247 |
+
try:
|
| 248 |
+
from urllib.parse import urlparse
|
| 249 |
+
parsed = urlparse(url)
|
| 250 |
+
return parsed.netloc.replace('www.', '')
|
| 251 |
+
except Exception:
|
| 252 |
+
return 'unknown'
|
| 253 |
+
|
| 254 |
+
def safe_divide(a: float, b: float, default: float = 0.0) -> float:
|
| 255 |
+
"""Safely divide two numbers"""
|
| 256 |
+
try:
|
| 257 |
+
return a / b if b != 0 else default
|
| 258 |
+
except (TypeError, ZeroDivisionError):
|
| 259 |
+
return default
|
| 260 |
+
|
| 261 |
+
def truncate_text(text: str, max_length: int = 100, suffix: str = "...") -> str:
|
| 262 |
+
"""Truncate text to specified length"""
|
| 263 |
+
if not text or len(text) <= max_length:
|
| 264 |
+
return text
|
| 265 |
+
|
| 266 |
+
return text[:max_length - len(suffix)] + suffix
|
| 267 |
+
|
| 268 |
+
def get_file_size_mb(filepath: str) -> float:
|
| 269 |
+
"""Get file size in MB"""
|
| 270 |
+
try:
|
| 271 |
+
size_bytes = os.path.getsize(filepath)
|
| 272 |
+
return round(size_bytes / (1024 * 1024), 2)
|
| 273 |
+
except Exception:
|
| 274 |
+
return 0.0
|
| 275 |
+
|
| 276 |
+
def ensure_directory(directory: str):
|
| 277 |
+
"""Ensure directory exists"""
|
| 278 |
+
try:
|
| 279 |
+
os.makedirs(directory, exist_ok=True)
|
| 280 |
+
except Exception as e:
|
| 281 |
+
logging.error(f"Failed to create directory {directory}: {str(e)}")
|
| 282 |
+
|
| 283 |
+
def load_json_file(filepath: str) -> Optional[Dict]:
|
| 284 |
+
"""Load JSON file safely"""
|
| 285 |
+
try:
|
| 286 |
+
with open(filepath, 'r', encoding='utf-8') as f:
|
| 287 |
+
return json.load(f)
|
| 288 |
+
except Exception as e:
|
| 289 |
+
logging.error(f"Failed to load JSON file {filepath}: {str(e)}")
|
| 290 |
+
return None
|
| 291 |
+
|
| 292 |
+
def save_json_file(data: Dict, filepath: str) -> bool:
|
| 293 |
+
"""Save data to JSON file safely"""
|
| 294 |
+
try:
|
| 295 |
+
ensure_directory(os.path.dirname(filepath))
|
| 296 |
+
with open(filepath, 'w', encoding='utf-8') as f:
|
| 297 |
+
json.dump(data, f, indent=2, default=str)
|
| 298 |
+
return True
|
| 299 |
+
except Exception as e:
|
| 300 |
+
logging.error(f"Failed to save JSON file {filepath}: {str(e)}")
|
| 301 |
+
return False
|
| 302 |
+
|
| 303 |
+
def merge_dictionaries(*dicts) -> Dict:
|
| 304 |
+
"""Merge multiple dictionaries"""
|
| 305 |
+
result = {}
|
| 306 |
+
for d in dicts:
|
| 307 |
+
if isinstance(d, dict):
|
| 308 |
+
result.update(d)
|
| 309 |
+
return result
|
| 310 |
+
|
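A few of the small helpers in action (the URL and headline are illustrative):

extract_domain('https://www.reuters.com/markets')          # 'reuters.com'
safe_divide(10, 0)                                          # 0.0 instead of raising
truncate_text('A very long headline about earnings', 20)   # 'A very long headl...'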
def get_system_info() -> Dict[str, Any]:
    """Get basic system information"""
    import platform
    import psutil

    try:
        return {
            'platform': platform.platform(),
            'python_version': platform.python_version(),
            'cpu_count': os.cpu_count(),
            'memory_gb': round(psutil.virtual_memory().total / (1024**3), 2),
            'available_memory_gb': round(psutil.virtual_memory().available / (1024**3), 2),
            'disk_space_gb': round(psutil.disk_usage('/').total / (1024**3), 2)
        }
    except Exception as e:
        logging.error(f"Failed to get system info: {str(e)}")
        return {'error': str(e)}

def format_number(num: float, precision: int = 2) -> str:
    """Format number for display"""
    try:
        if abs(num) >= 1_000_000:
            return f"{num / 1_000_000:.{precision}f}M"
        elif abs(num) >= 1_000:
            return f"{num / 1_000:.{precision}f}K"
        else:
            return f"{num:.{precision}f}"
    except Exception:
        return str(num)
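format_number scales values into K/M suffixes for display:

format_number(1_532_000)   # '1.53M'
format_number(4_200)       # '4.20K'
format_number(0.5, 1)      # '0.5'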
def calculate_sentiment_distribution(articles: List[Dict]) -> Dict[str, Any]:
    """Calculate sentiment distribution statistics"""
    try:
        if not articles:
            return {'positive': 0, 'negative': 0, 'neutral': 0, 'total': 0}

        sentiments = []
        for article in articles:
            sentiment = article.get('sentiment', {})
            compound = sentiment.get('compound', 0)
            sentiments.append(compound)

        # Compound scores above 0.1 count as positive, below -0.1 as negative
        positive_count = sum(1 for s in sentiments if s > 0.1)
        negative_count = sum(1 for s in sentiments if s < -0.1)
        neutral_count = len(sentiments) - positive_count - negative_count

        avg_sentiment = sum(sentiments) / len(sentiments) if sentiments else 0

        return {
            'positive': positive_count,
            'negative': negative_count,
            'neutral': neutral_count,
            'total': len(articles),
            'average_sentiment': round(avg_sentiment, 3),
            'positive_percentage': round((positive_count / len(articles)) * 100, 1),
            'negative_percentage': round((negative_count / len(articles)) * 100, 1),
            'neutral_percentage': round((neutral_count / len(articles)) * 100, 1)
        }

    except Exception as e:
        logging.error(f"Sentiment distribution calculation failed: {str(e)}")
        return {'positive': 0, 'negative': 0, 'neutral': 0, 'total': 0}
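One article per bucket under the +/-0.1 compound thresholds:

articles = [
    {'sentiment': {'compound': 0.6}},
    {'sentiment': {'compound': -0.4}},
    {'sentiment': {'compound': 0.05}},
]
dist = calculate_sentiment_distribution(articles)
print(dist['positive'], dist['negative'], dist['neutral'])   # 1 1 1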
def create_progress_callback(progress_container=None):
    """Create a progress callback function for Streamlit"""
    def callback(progress: int, status: str):
        if progress_container:
            try:
                progress_container.progress(progress)
                if hasattr(progress_container, 'text'):
                    progress_container.text(status)
            except Exception as e:
                logging.error(f"Progress callback error: {str(e)}")
        else:
            logging.info(f"Progress: {progress}% - {status}")

    return callback

def validate_url(url: str) -> bool:
    """Validate if string is a valid URL"""
    import re

    url_pattern = re.compile(
        r'^https?://'  # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)  # optional path or query

    return url_pattern.match(url) is not None
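A sketch of what the pattern accepts and rejects (example URLs only):

validate_url('https://news.example.com/article?id=42')   # True
validate_url('http://192.168.0.1:8080/')                 # True
validate_url('not a url')                                # False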
class PerformanceTimer:
    """Context manager for timing operations"""

    def __init__(self, operation_name: str = "Operation"):
        self.operation_name = operation_name
        self.start_time = None
        self.end_time = None

    def __enter__(self):
        self.start_time = datetime.now()
        logging.info(f"Starting {self.operation_name}")
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.end_time = datetime.now()
        duration = (self.end_time - self.start_time).total_seconds()
        logging.info(f"Completed {self.operation_name} in {duration:.2f} seconds")

    @property
    def duration(self) -> float:
        if self.start_time and self.end_time:
            return (self.end_time - self.start_time).total_seconds()
        return 0.0
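Context-manager usage; the duration property remains available after the block exits (scrape_all_sources is a hypothetical workload):

with PerformanceTimer("article scrape") as timer:
    scrape_all_sources()   # hypothetical workload
print(f"took {timer.duration:.2f}s")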
def retry_operation(func, max_attempts: int = 3, delay: float = 1.0):
    """Retry an operation with exponential backoff"""
    import time

    for attempt in range(max_attempts):
        try:
            return func()
        except Exception as e:
            if attempt == max_attempts - 1:
                raise e

            wait_time = delay * (2 ** attempt)
            logging.warning(f"Attempt {attempt + 1} failed: {str(e)}. Retrying in {wait_time} seconds...")
            time.sleep(wait_time)

    return None
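With the defaults, the waits grow as 1s then 2s between the three attempts. A sketch using a lambda (the URL is illustrative; requests is assumed available from requirements_file.txt):

import requests

result = retry_operation(
    lambda: requests.get('https://example.com/api', timeout=5).json(),
    max_attempts=3,
    delay=1.0,
)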