wekey1998 committed
Commit 8f8d0f6 · verified · 1 Parent(s): f29b750

Upload 13 files

api_backend (1).py ADDED
@@ -0,0 +1,355 @@
1
+ from fastapi import FastAPI, HTTPException, Query
2
+ from fastapi.middleware.cors import CORSMiddleware
3
+ from pydantic import BaseModel
4
+ from typing import List, Optional, Dict, Any
5
+ import asyncio
6
+ import logging
7
+ from datetime import datetime
8
+ import json
9
+
10
+ # Import our modules
11
+ from scraper import NewsletterScraper
12
+ from nlp import SentimentAnalyzer, KeywordExtractor
13
+ from summarizer import TextSummarizer
14
+ from translator import MultilingualTranslator
15
+ from tts import AudioGenerator
16
+ from utils import setup_logging, cache_results
17
+
18
+ # Setup logging
19
+ setup_logging()
20
+ logger = logging.getLogger(__name__)
21
+
22
+ # FastAPI app
23
+ app = FastAPI(
24
+ title="Global Business News Intelligence API",
25
+ description="Advanced news analysis with sentiment, summarization, and multilingual support",
26
+ version="1.0.0"
27
+ )
28
+
29
+ # CORS middleware
30
+ app.add_middleware(
31
+ CORSMiddleware,
32
+ allow_origins=["*"],
33
+ allow_credentials=True,
34
+ allow_methods=["*"],
35
+ allow_headers=["*"],
36
+ )
37
+
38
+ class AnalysisRequest(BaseModel):
39
+ query: str
40
+ num_articles: int = 20
41
+ languages: List[str] = ["English"]
42
+ include_audio: bool = True
43
+ sentiment_models: List[str] = ["VADER", "Loughran-McDonald", "FinBERT"]
44
+
45
+ class AnalysisResponse(BaseModel):
46
+ query: str
47
+ total_articles: int
48
+ processing_time: float
49
+ average_sentiment: float
50
+ sentiment_distribution: Dict[str, int]
51
+ articles: List[Dict[str, Any]]
52
+ keywords: List[Dict[str, Any]]
53
+ summary: Dict[str, Any]
54
+ languages: List[str]
55
+ audio_files: Optional[Dict[str, str]] = None
56
+
57
+ class NewsAnalyzer:
58
+ """Main news analysis orchestrator"""
59
+
60
+ def __init__(self):
61
+ self.scraper = NewsletterScraper()
62
+ self.sentiment_analyzer = SentimentAnalyzer()
63
+ self.keyword_extractor = KeywordExtractor()
64
+ self.summarizer = TextSummarizer()
65
+ self.translator = MultilingualTranslator()
66
+ self.audio_generator = AudioGenerator()
67
+
68
+ logger.info("NewsAnalyzer initialized successfully")
69
+
70
+ async def analyze_news_async(self, config: Dict[str, Any], progress_callback=None) -> Dict[str, Any]:
71
+ """Async version of analyze_news"""
72
+ # Run the blocking pipeline in a worker thread so the event loop is not blocked
+ return await asyncio.to_thread(self.analyze_news, config, progress_callback)
73
+
74
+ def analyze_news(self, config: Dict[str, Any], progress_callback=None) -> Dict[str, Any]:
75
+ """Main analysis pipeline"""
76
+ start_time = datetime.now()
77
+
78
+ try:
79
+ query = config['query']
80
+ num_articles = config.get('num_articles', 20)
81
+ languages = config.get('languages', ['English'])
82
+ include_audio = config.get('include_audio', True)
83
+ sentiment_models = config.get('sentiment_models', ['VADER', 'Loughran-McDonald', 'FinBERT'])
84
+
85
+ logger.info(f"Starting analysis for query: {query}")
86
+
87
+ if progress_callback:
88
+ progress_callback(10, "Scraping articles...")
89
+
90
+ # Step 1: Scrape articles
91
+ articles = self.scraper.scrape_news(query, num_articles)
92
+ logger.info(f"Scraped {len(articles)} articles")
93
+
94
+ if not articles:
95
+ raise ValueError("No articles found for the given query")
96
+
97
+ if progress_callback:
98
+ progress_callback(30, "Analyzing sentiment...")
99
+
100
+ # Step 2: Sentiment analysis
101
+ for article in articles:
102
+ article['sentiment'] = self.sentiment_analyzer.analyze_sentiment(
103
+ article['content'],
104
+ models=sentiment_models
105
+ )
106
+
107
+ if progress_callback:
108
+ progress_callback(50, "Extracting keywords...")
109
+
110
+ # Step 3: Keyword extraction
111
+ all_text = ' '.join([article['content'] for article in articles])
112
+ keywords = self.keyword_extractor.extract_keywords(all_text)
113
+
114
+ if progress_callback:
115
+ progress_callback(60, "Generating summaries...")
116
+
117
+ # Step 4: Summarization
118
+ for article in articles:
119
+ article['summary'] = self.summarizer.summarize(article['content'])
120
+
121
+ # Multilingual summaries
122
+ if len(languages) > 1:
123
+ article['summaries'] = {}
124
+ for lang in languages:
125
+ if lang != 'English':
126
+ article['summaries'][lang] = self.translator.translate(
127
+ article['summary'],
128
+ target_lang=lang
129
+ )
130
+ else:
131
+ article['summaries'][lang] = article['summary']
132
+
133
+ if progress_callback:
134
+ progress_callback(80, "Generating audio...")
135
+
136
+ # Step 5: Audio generation
137
+ audio_files = {}
138
+ if include_audio and languages:
139
+ # Create overall summary for audio
140
+ overall_summary = self.create_overall_summary(articles, keywords)
141
+
142
+ for lang in languages:
143
+ if lang in ['English', 'Hindi', 'Tamil']:
144
+ try:
145
+ if lang != 'English':
146
+ summary_text = self.translator.translate(overall_summary, target_lang=lang)
147
+ else:
148
+ summary_text = overall_summary
149
+
150
+ audio_file = self.audio_generator.generate_audio(
151
+ summary_text,
152
+ language=lang,
153
+ output_file=f"summary_{lang.lower()}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp3"
154
+ )
155
+ audio_files[lang] = audio_file
156
+ except Exception as e:
157
+ logger.error(f"Error generating audio for {lang}: {str(e)}")
158
+
159
+ if progress_callback:
160
+ progress_callback(90, "Finalizing results...")
161
+
162
+ # Step 6: Calculate summary statistics
163
+ sentiments = [article['sentiment']['compound'] for article in articles]
164
+ average_sentiment = sum(sentiments) / len(sentiments) if sentiments else 0.0
165
+
166
+ sentiment_distribution = {
167
+ 'Positive': sum(1 for s in sentiments if s > 0.1),
168
+ 'Negative': sum(1 for s in sentiments if s < -0.1),
169
+ 'Neutral': sum(1 for s in sentiments if -0.1 <= s <= 0.1)
170
+ }
171
+
172
+ # Step 7: Prepare results
173
+ processing_time = (datetime.now() - start_time).total_seconds()
174
+
175
+ results = {
176
+ 'query': query,
177
+ 'total_articles': len(articles),
178
+ 'processing_time': processing_time,
179
+ 'average_sentiment': average_sentiment,
180
+ 'sentiment_distribution': sentiment_distribution,
181
+ 'articles': articles,
182
+ 'keywords': keywords,
183
+ 'languages': languages,
184
+ 'audio_files': audio_files,
185
+ 'summary': {
186
+ 'average_sentiment': average_sentiment,
187
+ 'total_articles': len(articles),
188
+ 'sources': len(set([article['source'] for article in articles])),
189
+ 'date_range': self.get_date_range(articles)
190
+ }
191
+ }
192
+
193
+ if progress_callback:
194
+ progress_callback(100, "Analysis complete!")
195
+
196
+ logger.info(f"Analysis completed successfully in {processing_time:.2f} seconds")
197
+ return results
198
+
199
+ except Exception as e:
200
+ logger.error(f"Error in analysis pipeline: {str(e)}")
201
+ raise e
202
+
203
+ def create_overall_summary(self, articles: List[Dict], keywords: List[Dict]) -> str:
204
+ """Create an overall summary for audio generation"""
205
+ try:
206
+ # Get top keywords
207
+ top_keywords = [kw['keyword'] for kw in keywords[:10]]
208
+
209
+ # Calculate sentiment distribution
210
+ positive_count = sum(1 for article in articles if article['sentiment']['compound'] > 0.1)
211
+ negative_count = sum(1 for article in articles if article['sentiment']['compound'] < -0.1)
212
+ neutral_count = len(articles) - positive_count - negative_count
213
+
214
+ # Create summary text
215
+ summary = f"Analysis of {len(articles)} articles reveals "
216
+
217
+ if positive_count > negative_count:
218
+ summary += f"predominantly positive sentiment with {positive_count} positive, {negative_count} negative, and {neutral_count} neutral articles. "
219
+ elif negative_count > positive_count:
220
+ summary += f"predominantly negative sentiment with {negative_count} negative, {positive_count} positive, and {neutral_count} neutral articles. "
221
+ else:
222
+ summary += f"mixed sentiment with balanced coverage. "
223
+
224
+ if top_keywords:
225
+ summary += f"Key topics include: {', '.join(top_keywords[:5])}. "
226
+
227
+ # Add top stories
228
+ top_positive = sorted(articles, key=lambda x: x['sentiment']['compound'], reverse=True)[:2]
229
+ top_negative = sorted(articles, key=lambda x: x['sentiment']['compound'])[:2]
230
+
231
+ if top_positive[0]['sentiment']['compound'] > 0.1:
232
+ summary += f"Most positive coverage: {top_positive[0]['title'][:100]}. "
233
+
234
+ if top_negative[0]['sentiment']['compound'] < -0.1:
235
+ summary += f"Most concerning coverage: {top_negative[0]['title'][:100]}. "
236
+
237
+ return summary
238
+
239
+ except Exception as e:
240
+ logger.error(f"Error creating overall summary: {str(e)}")
241
+ return f"Analysis of {len(articles)} articles completed successfully."
242
+
243
+ def get_date_range(self, articles: List[Dict]) -> Dict[str, str]:
244
+ """Get the date range of articles"""
245
+ try:
246
+ dates = [article['date'] for article in articles if 'date' in article and article['date']]
247
+ if dates:
248
+ dates = [d for d in dates if d is not None]
249
+ if dates:
250
+ min_date = min(dates)
251
+ max_date = max(dates)
252
+ return {
253
+ 'start': str(min_date),
254
+ 'end': str(max_date)
255
+ }
256
+ return {'start': 'Unknown', 'end': 'Unknown'}
257
+ except Exception as e:
258
+ logger.error(f"Error getting date range: {str(e)}")
259
+ return {'start': 'Unknown', 'end': 'Unknown'}
260
+
261
+ # Initialize the analyzer
262
+ analyzer = NewsAnalyzer()
263
+
264
+ # FastAPI endpoints
265
+ @app.get("/", response_model=Dict[str, str])
266
+ async def root():
267
+ """API root endpoint"""
268
+ return {
269
+ "message": "Global Business News Intelligence API",
270
+ "version": "1.0.0",
271
+ "docs": "/docs"
272
+ }
273
+
274
+ @app.get("/health", response_model=Dict[str, str])
275
+ async def health_check():
276
+ """Health check endpoint"""
277
+ return {"status": "healthy", "timestamp": datetime.now().isoformat()}
278
+
279
+ @app.get("/api/analyze", response_model=AnalysisResponse)
280
+ async def analyze_news_endpoint(
281
+ query: str = Query(..., description="Company name, ticker, or keyword to analyze"),
282
+ num_articles: int = Query(20, description="Number of articles to analyze (5-50)", ge=5, le=50),
283
+ languages: List[str] = Query(["English"], description="Languages for summaries"),
284
+ include_audio: bool = Query(True, description="Generate audio summaries"),
285
+ sentiment_models: List[str] = Query(["VADER", "Loughran-McDonald", "FinBERT"], description="Sentiment models to use")
286
+ ):
287
+ """Main analysis endpoint"""
288
+ try:
289
+ config = {
290
+ 'query': query,
291
+ 'num_articles': num_articles,
292
+ 'languages': languages,
293
+ 'include_audio': include_audio,
294
+ 'sentiment_models': sentiment_models
295
+ }
296
+
297
+ results = await analyzer.analyze_news_async(config)
298
+
299
+ return AnalysisResponse(**results)
300
+
301
+ except Exception as e:
302
+ logger.error(f"Error in analyze endpoint: {str(e)}")
303
+ raise HTTPException(status_code=500, detail=str(e))
304
+
305
+ @app.post("/api/analyze", response_model=AnalysisResponse)
306
+ async def analyze_news_post(request: AnalysisRequest):
307
+ """POST version of analysis endpoint"""
308
+ try:
309
+ config = request.dict()
310
+ results = await analyzer.analyze_news_async(config)
311
+ return AnalysisResponse(**results)
312
+
313
+ except Exception as e:
314
+ logger.error(f"Error in analyze POST endpoint: {str(e)}")
315
+ raise HTTPException(status_code=500, detail=str(e))
316
+
317
+ @app.get("/api/sources", response_model=List[str])
318
+ async def get_available_sources():
319
+ """Get list of available news sources"""
320
+ return analyzer.scraper.get_available_sources()
321
+
322
+ @app.get("/api/models", response_model=Dict[str, List[str]])
323
+ async def get_available_models():
324
+ """Get list of available models"""
325
+ return {
326
+ "sentiment_models": ["VADER", "Loughran-McDonald", "FinBERT"],
327
+ "summarization_models": ["distilbart-cnn-12-6"],
328
+ "translation_models": ["Helsinki-NLP/opus-mt-en-hi", "Helsinki-NLP/opus-mt-en-fi"],
329
+ "audio_languages": ["English", "Hindi", "Tamil"]
330
+ }
331
+
332
+ @app.get("/api/keywords/{query}", response_model=List[Dict[str, Any]])
333
+ async def extract_keywords_endpoint(
334
+ query: str,
335
+ num_keywords: int = Query(20, description="Number of keywords to extract", ge=5, le=50)
336
+ ):
337
+ """Extract keywords from a query or text"""
338
+ try:
339
+ # For demo purposes, we'll scrape a few articles and extract keywords
340
+ articles = analyzer.scraper.scrape_news(query, 5)
341
+ if not articles:
342
+ raise HTTPException(status_code=404, detail="No articles found for query")
343
+
344
+ all_text = ' '.join([article['content'] for article in articles])
345
+ keywords = analyzer.keyword_extractor.extract_keywords(all_text, num_keywords=num_keywords)
346
+
347
+ return keywords
348
+
349
+ except Exception as e:
350
+ logger.error(f"Error in keywords endpoint: {str(e)}")
351
+ raise HTTPException(status_code=500, detail=str(e))
352
+
353
+ if __name__ == "__main__":
354
+ import uvicorn
355
+ uvicorn.run(app, host="0.0.0.0", port=8000)
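A minimal client sketch for the endpoints above, assuming the server is running locally on port 8000 as in the `__main__` block; the `requests` dependency and the example query are illustrative and not part of this commit:

import requests

# GET variant: parameters mirror the Query(...) declarations in analyze_news_endpoint.
resp = requests.get(
    "http://localhost:8000/api/analyze",
    params={"query": "Tesla", "num_articles": 10, "include_audio": "false"},
    timeout=300,
)
resp.raise_for_status()
data = resp.json()
print(data["total_articles"], data["average_sentiment"])

# POST variant: the JSON body matches the AnalysisRequest model.
resp = requests.post(
    "http://localhost:8000/api/analyze",
    json={"query": "Tesla", "num_articles": 10, "include_audio": False},
    timeout=300,
)
print(resp.json()["sentiment_distribution"])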
config_json.json ADDED
@@ -0,0 +1,51 @@
1
+ {
2
+ "max_articles": 50,
3
+ "cache_ttl_hours": 6,
4
+ "supported_languages": ["English", "Hindi", "Tamil"],
5
+ "sentiment_models": ["VADER", "Loughran-McDonald", "FinBERT"],
6
+ "summarization_max_length": 150,
7
+ "summarization_min_length": 50,
8
+ "audio_enabled": true,
9
+ "translation_enabled": true,
10
+ "keyword_extraction_enabled": true,
11
+ "max_keywords": 20,
12
+ "debug_mode": false,
13
+ "huggingface_space_config": {
14
+ "title": "Global Business News Intelligence Dashboard",
15
+ "emoji": "📊",
16
+ "colorFrom": "blue",
17
+ "colorTo": "green",
18
+ "sdk": "streamlit",
19
+ "sdk_version": "1.28.1",
20
+ "app_file": "app.py",
21
+ "pinned": false,
22
+ "license": "mit"
23
+ },
24
+ "api_config": {
25
+ "host": "0.0.0.0",
26
+ "port": 8000,
27
+ "reload": false,
28
+ "workers": 1
29
+ },
30
+ "performance_settings": {
31
+ "max_concurrent_requests": 10,
32
+ "request_timeout_seconds": 300,
33
+ "memory_limit_gb": 4,
34
+ "cpu_optimization": true
35
+ },
36
+ "news_sources": {
37
+ "google_news": true,
38
+ "reuters": true,
39
+ "bbc": true,
40
+ "cnbc": true,
41
+ "bloomberg": true,
42
+ "marketwatch": true,
43
+ "financial_times": false
44
+ },
45
+ "model_settings": {
46
+ "use_cpu_only": true,
47
+ "model_cache_dir": "./model_cache",
48
+ "download_models_on_startup": false,
49
+ "optimize_for_inference": true
50
+ }
51
+ }
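A small sketch of reading this configuration from Python; the key names come from the JSON above, but how the app itself loads the file is not shown in this commit:

import json

with open("config_json.json", "r", encoding="utf-8") as f:
    config = json.load(f)

print(config["max_articles"])                                       # 50
print(config["sentiment_models"])                                   # ["VADER", "Loughran-McDonald", "FinBERT"]
print(config["supported_languages"])                                # ["English", "Hindi", "Tamil"]
print(config["api_config"]["host"], config["api_config"]["port"])   # 0.0.0.0 8000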
dockerfile.txt ADDED
@@ -0,0 +1,46 @@
1
+ # Use official Python runtime as base image
2
+ FROM python:3.9-slim
3
+
4
+ # Set working directory
5
+ WORKDIR /app
6
+
7
+ # Set environment variables
8
+ ENV PYTHONUNBUFFERED=1
9
+ ENV PYTHONDONTWRITEBYTECODE=1
10
+ ENV STREAMLIT_SERVER_HEADLESS=true
11
+ ENV STREAMLIT_SERVER_PORT=7860
12
+ ENV STREAMLIT_SERVER_ADDRESS=0.0.0.0
13
+ ENV STREAMLIT_BROWSER_GATHER_USAGE_STATS=false
14
+
15
+ # Install system dependencies
16
+ RUN apt-get update && apt-get install -y \
17
+ build-essential \
18
+ curl \
19
+ software-properties-common \
20
+ git \
21
+ && rm -rf /var/lib/apt/lists/*
22
+
23
+ # Copy requirements first for better caching
24
+ COPY requirements.txt .
25
+
26
+ # Install Python dependencies
27
+ RUN pip install --no-cache-dir --upgrade pip && \
28
+ pip install --no-cache-dir -r requirements.txt
29
+
30
+ # Download NLTK data
31
+ RUN python -c "import nltk; nltk.download('vader_lexicon'); nltk.download('punkt'); nltk.download('stopwords')"
32
+
33
+ # Copy application code
34
+ COPY . .
35
+
36
+ # Create necessary directories
37
+ RUN mkdir -p logs cache model_cache temp
38
+
39
+ # Expose port
40
+ EXPOSE 7860
41
+
42
+ # Health check
43
+ HEALTHCHECK CMD curl --fail http://localhost:7860/_stcore/health
44
+
45
+ # Run application
46
+ CMD ["streamlit", "run", "app.py", "--server.port=7860", "--server.address=0.0.0.0"]
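A quick sketch of the same probe the HEALTHCHECK runs, from Python, assuming the container is running with port 7860 published (for example, docker run -p 7860:7860 on the built image):

import urllib.request

# Streamlit's built-in health endpoint, the same URL the Dockerfile's HEALTHCHECK curls.
with urllib.request.urlopen("http://localhost:7860/_stcore/health", timeout=10) as resp:
    print(resp.status)  # 200 when the Streamlit server is up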
gitattributes_file.txt ADDED
@@ -0,0 +1,33 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
nlp_module (1).py ADDED
@@ -0,0 +1,464 @@
1
+ import re
2
+ import string
3
+ import logging
4
+ from typing import Dict, List, Any, Optional
5
+ import pandas as pd
6
+ import numpy as np
7
+ from collections import Counter
8
+
9
+ # NLTK imports
10
+ import nltk
11
+ try:
12
+ from nltk.sentiment import SentimentIntensityAnalyzer
13
+ from nltk.corpus import stopwords
14
+ from nltk.tokenize import word_tokenize, sent_tokenize
15
+ from nltk.stem import PorterStemmer
16
+ except ImportError:
17
+ pass
18
+
19
+ # Download required NLTK data
20
+ try:
21
+ nltk.download('vader_lexicon', quiet=True)
22
+ nltk.download('punkt', quiet=True)
23
+ nltk.download('stopwords', quiet=True)
24
+ except Exception:
25
+ pass
26
+
27
+ # Transformers for FinBERT
28
+ try:
29
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
30
+ import torch
31
+ except ImportError:
32
+ pass
33
+
34
+ # YAKE for keyword extraction
35
+ try:
36
+ import yake
37
+ except ImportError:
38
+ pass
39
+
40
+ logger = logging.getLogger(__name__)
41
+
42
+ class SentimentAnalyzer:
43
+ """Multi-model sentiment analysis"""
44
+
45
+ def __init__(self):
46
+ self.vader_analyzer = None
47
+ self.finbert_pipeline = None
48
+ self.loughran_mcdonald_dict = None
49
+
50
+ self._initialize_models()
51
+ logger.info("SentimentAnalyzer initialized")
52
+
53
+ def _initialize_models(self):
54
+ """Initialize all sentiment analysis models"""
55
+ # VADER
56
+ try:
57
+ self.vader_analyzer = SentimentIntensityAnalyzer()
58
+ logger.info("VADER model loaded")
59
+ except Exception as e:
60
+ logger.error(f"Failed to load VADER: {str(e)}")
61
+
62
+ # FinBERT
63
+ try:
64
+ model_name = "ProsusAI/finbert"
65
+ self.finbert_pipeline = pipeline(
66
+ "sentiment-analysis",
67
+ model=model_name,
68
+ tokenizer=model_name,
69
+ device=0 if torch.cuda.is_available() else -1
70
+ )
71
+ logger.info("FinBERT model loaded")
72
+ except Exception as e:
73
+ logger.warning(f"Failed to load FinBERT, using CPU fallback: {str(e)}")
74
+ try:
75
+ model_name = "ProsusAI/finbert"
76
+ self.finbert_pipeline = pipeline(
77
+ "sentiment-analysis",
78
+ model=model_name,
79
+ tokenizer=model_name,
80
+ device=-1
81
+ )
82
+ logger.info("FinBERT model loaded on CPU")
83
+ except Exception as e2:
84
+ logger.error(f"Failed to load FinBERT completely: {str(e2)}")
85
+
86
+ # Loughran-McDonald Dictionary
87
+ try:
88
+ self.loughran_mcdonald_dict = self._load_loughran_mcdonald()
89
+ logger.info("Loughran-McDonald dictionary loaded")
90
+ except Exception as e:
91
+ logger.error(f"Failed to load Loughran-McDonald dictionary: {str(e)}")
92
+
93
+ def _load_loughran_mcdonald(self) -> Dict[str, List[str]]:
94
+ """Load Loughran-McDonald financial sentiment dictionary"""
95
+ # Simplified version with key financial sentiment words
96
+ return {
97
+ 'positive': [
98
+ 'profit', 'profitable', 'profitability', 'revenue', 'revenues', 'growth',
99
+ 'growing', 'increase', 'increased', 'increasing', 'success', 'successful',
100
+ 'gain', 'gains', 'benefit', 'benefits', 'improvement', 'improved', 'strong',
101
+ 'stronger', 'excellent', 'outstanding', 'exceed', 'exceeded', 'exceeds',
102
+ 'beat', 'beats', 'positive', 'optimistic', 'bullish', 'rise', 'rising',
103
+ 'surge', 'surged', 'boom', 'booming', 'expand', 'expansion', 'opportunity',
104
+ 'opportunities', 'advance', 'advances', 'achievement', 'achieve', 'winner'
105
+ ],
106
+ 'negative': [
107
+ 'loss', 'losses', 'lose', 'losing', 'decline', 'declining', 'decrease',
108
+ 'decreased', 'decreasing', 'fall', 'falling', 'drop', 'dropped', 'plunge',
109
+ 'plunged', 'crash', 'crashed', 'failure', 'failed', 'weak', 'weakness',
110
+ 'poor', 'worse', 'worst', 'bad', 'terrible', 'crisis', 'problem', 'problems',
111
+ 'risk', 'risks', 'risky', 'concern', 'concerns', 'worried', 'worry',
112
+ 'negative', 'pessimistic', 'bearish', 'bankruptcy', 'bankrupt', 'deficit',
113
+ 'debt', 'lawsuit', 'sue', 'sued', 'investigation', 'fraud', 'scandal',
114
+ 'volatility', 'volatile', 'uncertainty', 'uncertain', 'challenge', 'challenges'
115
+ ]
116
+ }
117
+
118
+ def analyze_sentiment(self, text: str, models: List[str] = None) -> Dict[str, Any]:
119
+ """Analyze sentiment using multiple models"""
120
+ if models is None:
121
+ models = ['VADER', 'Loughran-McDonald', 'FinBERT']
122
+
123
+ results = {}
124
+
125
+ # Clean text
126
+ cleaned_text = self._clean_text(text)
127
+
128
+ # VADER Analysis
129
+ if 'VADER' in models and self.vader_analyzer:
130
+ try:
131
+ vader_scores = self.vader_analyzer.polarity_scores(cleaned_text)
132
+ results['vader'] = vader_scores['compound']
133
+ results['vader_detailed'] = vader_scores
134
+ except Exception as e:
135
+ logger.error(f"VADER analysis failed: {str(e)}")
136
+ results['vader'] = 0.0
137
+
138
+ # Loughran-McDonald Analysis
139
+ if 'Loughran-McDonald' in models and self.loughran_mcdonald_dict:
140
+ try:
141
+ lm_score = self._analyze_loughran_mcdonald(cleaned_text)
142
+ results['loughran_mcdonald'] = lm_score
143
+ except Exception as e:
144
+ logger.error(f"Loughran-McDonald analysis failed: {str(e)}")
145
+ results['loughran_mcdonald'] = 0.0
146
+
147
+ # FinBERT Analysis
148
+ if 'FinBERT' in models and self.finbert_pipeline:
149
+ try:
150
+ # Truncate text for FinBERT (max 512 tokens)
151
+ truncated_text = cleaned_text[:2000] # Approximate token limit
152
+ finbert_result = self.finbert_pipeline(truncated_text)[0]
153
+
154
+ # Convert to numerical score
155
+ label = finbert_result['label'].lower()
156
+ confidence = finbert_result['score']
157
+
158
+ if label == 'positive':
159
+ finbert_score = confidence
160
+ elif label == 'negative':
161
+ finbert_score = -confidence
162
+ else: # neutral
163
+ finbert_score = 0.0
164
+
165
+ results['finbert'] = finbert_score
166
+ results['finbert_detailed'] = finbert_result
167
+
168
+ except Exception as e:
169
+ logger.error(f"FinBERT analysis failed: {str(e)}")
170
+ results['finbert'] = 0.0
171
+
172
+ # Calculate composite score
173
+ scores = []
174
+ weights = {'vader': 0.3, 'loughran_mcdonald': 0.4, 'finbert': 0.3}
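+ # Worked example (illustrative values): vader=0.5, loughran_mcdonald=0.25, finbert=0.4
+ #   compound = 0.5*0.3 + 0.25*0.4 + 0.4*0.3 = 0.15 + 0.10 + 0.12 = 0.37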
175
+
176
+ for model in ['vader', 'loughran_mcdonald', 'finbert']:
177
+ if model in results:
178
+ scores.append(results[model] * weights[model])
179
+
180
+ results['compound'] = sum(scores) if scores else 0.0
181
+
182
+ return results
183
+
184
+ def _clean_text(self, text: str) -> str:
185
+ """Clean text for sentiment analysis"""
186
+ if not text:
187
+ return ""
188
+
189
+ # Remove URLs
190
+ text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
191
+
192
+ # Remove email addresses
193
+ text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '', text)
194
+
195
+ # Remove extra whitespace
196
+ text = re.sub(r'\s+', ' ', text)
197
+
198
+ # Remove special characters but keep basic punctuation
199
+ text = re.sub(r'[^\w\s.,!?;:\-\'"()]', '', text)
200
+
201
+ return text.strip()
202
+
203
+ def _analyze_loughran_mcdonald(self, text: str) -> float:
204
+ """Analyze sentiment using Loughran-McDonald dictionary"""
205
+ try:
206
+ words = word_tokenize(text.lower())
207
+
208
+ positive_count = sum(1 for word in words if word in self.loughran_mcdonald_dict['positive'])
209
+ negative_count = sum(1 for word in words if word in self.loughran_mcdonald_dict['negative'])
210
+
211
+ total_sentiment_words = positive_count + negative_count
212
+
213
+ if total_sentiment_words == 0:
214
+ return 0.0
215
+
216
+ # Calculate normalized score
217
+ score = (positive_count - negative_count) / len(words) * 10 # Scale factor
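+ # e.g. 3 positive and 1 negative hits in a 100-word text: (3 - 1) / 100 * 10 = 0.2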
218
+
219
+ # Clamp to [-1, 1] range
220
+ return max(-1.0, min(1.0, score))
221
+
222
+ except Exception as e:
223
+ logger.error(f"Loughran-McDonald calculation error: {str(e)}")
224
+ return 0.0
225
+
226
+ class KeywordExtractor:
227
+ """Extract important keywords from text using YAKE"""
228
+
229
+ def __init__(self):
230
+ self.stop_words = set()
231
+ try:
232
+ self.stop_words = set(stopwords.words('english'))
233
+ except Exception:
234
+ # Fallback stop words
235
+ self.stop_words = {
236
+ 'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
237
+ 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'have',
238
+ 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should',
239
+ 'may', 'might', 'must', 'can', 'this', 'that', 'these', 'those'
240
+ }
241
+
242
+ logger.info("KeywordExtractor initialized")
243
+
244
+ def extract_keywords(self, text: str, num_keywords: int = 20) -> List[Dict[str, Any]]:
245
+ """Extract keywords using YAKE algorithm"""
246
+ try:
247
+ # Use YAKE if available
248
+ if 'yake' in globals():
249
+ return self._extract_with_yake(text, num_keywords)
250
+ else:
251
+ return self._extract_with_frequency(text, num_keywords)
252
+
253
+ except Exception as e:
254
+ logger.error(f"Keyword extraction failed: {str(e)}")
255
+ return []
256
+
257
+ def _extract_with_yake(self, text: str, num_keywords: int) -> List[Dict[str, Any]]:
258
+ """Extract keywords using YAKE algorithm"""
259
+ try:
260
+ # YAKE configuration
261
+ kw_extractor = yake.KeywordExtractor(
262
+ lan="en",
263
+ n=3, # n-gram size
264
+ dedupLim=0.9,
265
+ top=num_keywords,
266
+ features=None
267
+ )
268
+
269
+ keywords = kw_extractor.extract_keywords(text)
270
+
271
+ # Convert to desired format (lower score = more relevant in YAKE)
272
+ result = []
273
+ for keyword, score in keywords:
274
+ result.append({
275
+ 'keyword': keyword,
276
+ 'score': 1.0 / (1.0 + score), # Invert score so higher = more relevant
277
+ 'relevance': 'high' if score < 0.1 else 'medium' if score < 0.3 else 'low'
278
+ })
279
+
280
+ return result
281
+
282
+ except Exception as e:
283
+ logger.error(f"YAKE extraction failed: {str(e)}")
284
+ return self._extract_with_frequency(text, num_keywords)
285
+
286
+ def _extract_with_frequency(self, text: str, num_keywords: int) -> List[Dict[str, Any]]:
287
+ """Fallback keyword extraction using frequency analysis"""
288
+ try:
289
+ # Clean and tokenize
290
+ words = word_tokenize(text.lower())
291
+
292
+ # Filter words
293
+ filtered_words = [
294
+ word for word in words
295
+ if (word not in self.stop_words and
296
+ word not in string.punctuation and
297
+ len(word) > 2 and
298
+ word.isalpha())
299
+ ]
300
+
301
+ # Count frequencies
302
+ word_freq = Counter(filtered_words)
303
+
304
+ # Get top keywords
305
+ top_words = word_freq.most_common(num_keywords)
306
+
307
+ # Calculate relevance scores
308
+ max_freq = top_words[0][1] if top_words else 1
309
+
310
+ result = []
311
+ for word, freq in top_words:
312
+ score = freq / max_freq
313
+ result.append({
314
+ 'keyword': word,
315
+ 'score': score,
316
+ 'relevance': 'high' if score > 0.7 else 'medium' if score > 0.3 else 'low'
317
+ })
318
+
319
+ return result
320
+
321
+ except Exception as e:
322
+ logger.error(f"Frequency extraction failed: {str(e)}")
323
+ return []
324
+
325
+ class TextProcessor:
326
+ """Text preprocessing and cleaning utilities"""
327
+
328
+ def __init__(self):
329
+ self.stemmer = PorterStemmer()
330
+ logger.info("TextProcessor initialized")
331
+
332
+ def clean_article_content(self, content: str) -> str:
333
+ """Clean article content by removing boilerplate"""
334
+ if not content:
335
+ return ""
336
+
337
+ # Remove common boilerplate patterns
338
+ boilerplate_patterns = [
339
+ r'Subscribe to our newsletter.*',
340
+ r'Sign up for.*',
341
+ r'Follow us on.*',
342
+ r'Copyright.*',
343
+ r'All rights reserved.*',
344
+ r'Terms of use.*',
345
+ r'Privacy policy.*',
346
+ r'Cookie policy.*',
347
+ r'\d+ comments?',
348
+ r'Share this article.*',
349
+ r'Related articles?.*',
350
+ r'More from.*',
351
+ r'Advertisement.*',
352
+ r'Sponsored content.*'
353
+ ]
354
+
355
+ cleaned_content = content
356
+ for pattern in boilerplate_patterns:
357
+ cleaned_content = re.sub(pattern, '', cleaned_content, flags=re.IGNORECASE)
358
+
359
+ # Remove extra whitespace
360
+ cleaned_content = re.sub(r'\s+', ' ', cleaned_content)
361
+
362
+ # Remove very short sentences (likely navigation/boilerplate)
363
+ sentences = sent_tokenize(cleaned_content)
364
+ meaningful_sentences = [
365
+ sent for sent in sentences
366
+ if len(sent.split()) > 5 and not self._is_boilerplate_sentence(sent)
367
+ ]
368
+
369
+ return ' '.join(meaningful_sentences).strip()
370
+
371
+ def _is_boilerplate_sentence(self, sentence: str) -> bool:
372
+ """Check if sentence is likely boilerplate"""
373
+ boilerplate_indicators = [
374
+ 'click here', 'read more', 'subscribe', 'follow us', 'contact us',
375
+ 'terms of service', 'privacy policy', 'copyright', 'all rights reserved',
376
+ 'advertisement', 'sponsored', 'related articles'
377
+ ]
378
+
379
+ sentence_lower = sentence.lower()
380
+ return any(indicator in sentence_lower for indicator in boilerplate_indicators)
381
+
382
+ def extract_entities(self, text: str) -> Dict[str, List[str]]:
383
+ """Extract named entities (companies, people, locations)"""
384
+ # Simple regex-based entity extraction
385
+ entities = {
386
+ 'companies': [],
387
+ 'people': [],
388
+ 'locations': [],
389
+ 'money': [],
390
+ 'dates': []
391
+ }
392
+
393
+ try:
394
+ # Company patterns (simplified)
395
+ company_pattern = r'\b[A-Z][a-zA-Z]+ (?:Inc|Corp|LLC|Ltd|Company|Co)\b'
396
+ entities['companies'] = list(set(re.findall(company_pattern, text)))
397
+
398
+ # Money patterns
399
+ money_pattern = r'\$[\d,]+(?:\.\d{2})?(?:\s?(?:million|billion|trillion|k|M|B|T))?'
400
+ entities['money'] = list(set(re.findall(money_pattern, text)))
401
+
402
+ # Date patterns (simplified)
403
+ date_pattern = r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}'
404
+ entities['dates'] = list(set(re.findall(date_pattern, text)))
405
+
406
+ except Exception as e:
407
+ logger.error(f"Entity extraction failed: {str(e)}")
408
+
409
+ return entities
410
+
411
+ def calculate_readability(self, text: str) -> Dict[str, float]:
412
+ """Calculate text readability metrics"""
413
+ try:
414
+ sentences = sent_tokenize(text)
415
+ words = word_tokenize(text)
416
+
417
+ if not sentences or not words:
418
+ return {'flesch_score': 0.0, 'avg_sentence_length': 0.0, 'avg_word_length': 0.0}
419
+
420
+ # Basic metrics
421
+ num_sentences = len(sentences)
422
+ num_words = len(words)
423
+ num_syllables = sum(self._count_syllables(word) for word in words if word.isalpha())
424
+
425
+ # Average sentence length
426
+ avg_sentence_length = num_words / num_sentences
427
+
428
+ # Average word length
429
+ avg_word_length = sum(len(word) for word in words if word.isalpha()) / num_words
430
+
431
+ # Flesch Reading Ease Score (simplified)
432
+ flesch_score = 206.835 - (1.015 * avg_sentence_length) - (84.6 * (num_syllables / num_words))
433
+
434
+ return {
435
+ 'flesch_score': max(0.0, min(100.0, flesch_score)),
436
+ 'avg_sentence_length': avg_sentence_length,
437
+ 'avg_word_length': avg_word_length
438
+ }
439
+
440
+ except Exception as e:
441
+ logger.error(f"Readability calculation failed: {str(e)}")
442
+ return {'flesch_score': 0.0, 'avg_sentence_length': 0.0, 'avg_word_length': 0.0}
443
+
444
+ def _count_syllables(self, word: str) -> int:
445
+ """Count syllables in a word (simplified)"""
446
+ word = word.lower()
447
+ vowels = 'aeiouy'
448
+ syllable_count = 0
449
+ prev_char_was_vowel = False
450
+
451
+ for char in word:
452
+ if char in vowels:
453
+ if not prev_char_was_vowel:
454
+ syllable_count += 1
455
+ prev_char_was_vowel = True
456
+ else:
457
+ prev_char_was_vowel = False
458
+
459
+ # Handle silent e
460
+ if word.endswith('e'):
461
+ syllable_count -= 1
462
+
463
+ # Every word has at least one syllable
464
+ return max(1, syllable_count)
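A short usage sketch for the classes above; the `from nlp import ...` line assumes this file is deployed as nlp.py, which is how the API backend imports it, and the sample sentence is illustrative:

from nlp import SentimentAnalyzer, KeywordExtractor

text = "Acme Corp reported strong revenue growth, beating estimates despite market uncertainty."

analyzer = SentimentAnalyzer()
scores = analyzer.analyze_sentiment(text, models=["VADER", "Loughran-McDonald"])
print(scores["compound"])   # weighted composite, roughly in [-1, 1]

extractor = KeywordExtractor()
for kw in extractor.extract_keywords(text, num_keywords=5):
    print(kw["keyword"], kw["score"], kw["relevance"])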
report_module (1).py ADDED
@@ -0,0 +1,606 @@
1
+ import logging
2
+ from typing import Dict, List, Any, Optional
3
+ import io
4
+ from datetime import datetime
5
+ import base64
6
+
7
+ # PDF generation
8
+ try:
9
+ from reportlab.lib.pagesizes import letter, A4
10
+ from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, Image
11
+ from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
12
+ from reportlab.lib.units import inch
13
+ from reportlab.lib import colors
14
+ from reportlab.graphics.shapes import Drawing
15
+ from reportlab.graphics.charts.piecharts import Pie
16
+ from reportlab.graphics.charts.barcharts import VerticalBarChart
17
+ REPORTLAB_AVAILABLE = True
18
+ except ImportError:
19
+ REPORTLAB_AVAILABLE = False
20
+
21
+ # Plotting for charts in PDF
22
+ try:
23
+ import matplotlib.pyplot as plt
24
+ import matplotlib
25
+ matplotlib.use('Agg') # Use non-interactive backend
26
+ MATPLOTLIB_AVAILABLE = True
27
+ except ImportError:
28
+ MATPLOTLIB_AVAILABLE = False
29
+
30
+ logger = logging.getLogger(__name__)
31
+
32
+ def generate_pdf_report(results: Dict[str, Any]) -> io.BytesIO:
33
+ """Generate a comprehensive PDF report"""
34
+ if not REPORTLAB_AVAILABLE:
35
+ logger.error("ReportLab not available for PDF generation")
36
+ return _generate_simple_pdf_fallback(results)
37
+
38
+ try:
39
+ # Create PDF buffer
40
+ buffer = io.BytesIO()
41
+
42
+ # Create document
43
+ doc = SimpleDocTemplate(
44
+ buffer,
45
+ pagesize=A4,
46
+ rightMargin=72,
47
+ leftMargin=72,
48
+ topMargin=72,
49
+ bottomMargin=18
50
+ )
51
+
52
+ # Get styles
53
+ styles = getSampleStyleSheet()
54
+
55
+ # Create custom styles
56
+ title_style = ParagraphStyle(
57
+ 'CustomTitle',
58
+ parent=styles['Heading1'],
59
+ fontSize=24,
60
+ spaceAfter=30,
61
+ textColor=colors.HexColor('#2E86AB'),
62
+ alignment=1 # Center
63
+ )
64
+
65
+ heading_style = ParagraphStyle(
66
+ 'CustomHeading',
67
+ parent=styles['Heading2'],
68
+ fontSize=16,
69
+ spaceAfter=12,
70
+ spaceBefore=20,
71
+ textColor=colors.HexColor('#2E86AB')
72
+ )
73
+
74
+ # Build story (content)
75
+ story = []
76
+
77
+ # Title page
78
+ story.append(Paragraph("Global Business News Intelligence Report", title_style))
79
+ story.append(Spacer(1, 0.5*inch))
80
+
81
+ # Query and basic info
82
+ story.append(Paragraph(f"Analysis Target: {results.get('query', 'N/A')}", styles['Normal']))
83
+ story.append(Paragraph(f"Report Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", styles['Normal']))
84
+ story.append(Paragraph(f"Total Articles Analyzed: {results.get('total_articles', 0)}", styles['Normal']))
85
+ story.append(Paragraph(f"Processing Time: {results.get('processing_time', 0):.2f} seconds", styles['Normal']))
86
+ story.append(Spacer(1, 0.3*inch))
87
+
88
+ # Executive Summary
89
+ story.append(Paragraph("Executive Summary", heading_style))
90
+ summary_text = _create_executive_summary(results)
91
+ story.append(Paragraph(summary_text, styles['Normal']))
92
+ story.append(Spacer(1, 0.2*inch))
93
+
94
+ # Sentiment Analysis Section
95
+ story.append(Paragraph("Sentiment Analysis", heading_style))
96
+ sentiment_data = _create_sentiment_section(results, styles)
97
+ story.extend(sentiment_data)
98
+
99
+ # Top Stories Section
100
+ story.append(Paragraph("Key Stories", heading_style))
101
+ stories_data = _create_stories_section(results, styles)
102
+ story.extend(stories_data)
103
+
104
+ # Keywords Section
105
+ if 'keywords' in results and results['keywords']:
106
+ story.append(Paragraph("Key Topics and Themes", heading_style))
107
+ keywords_data = _create_keywords_section(results, styles)
108
+ story.extend(keywords_data)
109
+
110
+ # Sources Section
111
+ story.append(Paragraph("News Sources", heading_style))
112
+ sources_data = _create_sources_section(results, styles)
113
+ story.extend(sources_data)
114
+
115
+ # Methodology Section
116
+ story.append(Paragraph("Methodology", heading_style))
117
+ methodology_text = _create_methodology_section(results)
118
+ story.append(Paragraph(methodology_text, styles['Normal']))
119
+
120
+ # Build PDF
121
+ doc.build(story)
122
+
123
+ buffer.seek(0)
124
+ return buffer
125
+
126
+ except Exception as e:
127
+ logger.error(f"PDF generation failed: {str(e)}")
128
+ return _generate_simple_pdf_fallback(results)
129
+
130
+ def _create_executive_summary(results: Dict[str, Any]) -> str:
131
+ """Create executive summary text"""
132
+ try:
133
+ query = results.get('query', 'the analyzed topic')
134
+ total_articles = results.get('total_articles', 0)
135
+ avg_sentiment = results.get('average_sentiment', 0)
136
+
137
+ sentiment_label = "positive" if avg_sentiment > 0.1 else "negative" if avg_sentiment < -0.1 else "neutral"
138
+
139
+ summary = f"This report analyzes {total_articles} news articles related to {query}. "
140
+ summary += f"The overall sentiment analysis reveals a {sentiment_label} tone with an average sentiment score of {avg_sentiment:.3f}. "
141
+
142
+ # Add sentiment distribution
143
+ dist = results.get('sentiment_distribution', {})
144
+ positive = dist.get('Positive', 0)
145
+ negative = dist.get('Negative', 0)
146
+ neutral = dist.get('Neutral', 0)
147
+
148
+ summary += f"The analysis shows {positive} positive articles ({positive/total_articles*100:.1f}%), "
149
+ summary += f"{negative} negative articles ({negative/total_articles*100:.1f}%), "
150
+ summary += f"and {neutral} neutral articles ({neutral/total_articles*100:.1f}%). "
151
+
152
+ # Add key insights
153
+ if avg_sentiment > 0.2:
154
+ summary += "The predominantly positive coverage suggests favorable market conditions or public perception."
155
+ elif avg_sentiment < -0.2:
156
+ summary += "The predominantly negative coverage indicates concerns or challenges that may require attention."
157
+ else:
158
+ summary += "The balanced sentiment coverage suggests a mixed outlook with both opportunities and challenges present."
159
+
160
+ return summary
161
+
162
+ except Exception as e:
163
+ logger.error(f"Executive summary creation failed: {str(e)}")
164
+ return "Analysis completed successfully with comprehensive sentiment evaluation across multiple news sources."
165
+
166
+ def _create_sentiment_section(results: Dict[str, Any], styles) -> List:
167
+ """Create sentiment analysis section"""
168
+ story = []
169
+
170
+ try:
171
+ # Sentiment distribution table
172
+ dist = results.get('sentiment_distribution', {})
173
+ sentiment_data = [
174
+ ['Sentiment', 'Count', 'Percentage'],
175
+ ['Positive', str(dist.get('Positive', 0)), f"{dist.get('Positive', 0)/results.get('total_articles', 1)*100:.1f}%"],
176
+ ['Negative', str(dist.get('Negative', 0)), f"{dist.get('Negative', 0)/results.get('total_articles', 1)*100:.1f}%"],
177
+ ['Neutral', str(dist.get('Neutral', 0)), f"{dist.get('Neutral', 0)/results.get('total_articles', 1)*100:.1f}%"]
178
+ ]
179
+
180
+ sentiment_table = Table(sentiment_data)
181
+ sentiment_table.setStyle(TableStyle([
182
+ ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#2E86AB')),
183
+ ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
184
+ ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
185
+ ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
186
+ ('FONTSIZE', (0, 0), (-1, 0), 12),
187
+ ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
188
+ ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
189
+ ('GRID', (0, 0), (-1, -1), 1, colors.black)
190
+ ]))
191
+
192
+ story.append(sentiment_table)
193
+ story.append(Spacer(1, 0.2*inch))
194
+
195
+ # Add sentiment analysis explanation
196
+ explanation = "Sentiment analysis was performed using multiple models including VADER, Loughran-McDonald financial dictionary, and FinBERT. "
197
+ explanation += "Scores range from -1.0 (most negative) to +1.0 (most positive), with scores between -0.1 and +0.1 considered neutral."
198
+
199
+ story.append(Paragraph(explanation, styles['Normal']))
200
+ story.append(Spacer(1, 0.2*inch))
201
+
202
+ except Exception as e:
203
+ logger.error(f"Sentiment section creation failed: {str(e)}")
204
+ story.append(Paragraph("Sentiment analysis data unavailable.", styles['Normal']))
205
+
206
+ return story
207
+
208
+ def _create_stories_section(results: Dict[str, Any], styles) -> List:
209
+ """Create top stories section"""
210
+ story = []
211
+
212
+ try:
213
+ articles = results.get('articles', [])
214
+ if not articles:
215
+ story.append(Paragraph("No articles available for analysis.", styles['Normal']))
216
+ return story
217
+
218
+ # Sort articles by sentiment score
219
+ sorted_articles = sorted(articles, key=lambda x: x.get('sentiment', {}).get('compound', 0), reverse=True)
220
+
221
+ # Most positive story
222
+ if sorted_articles and sorted_articles[0].get('sentiment', {}).get('compound', 0) > 0.1:
223
+ story.append(Paragraph("Most Positive Coverage:", styles['Heading3']))
224
+ top_positive = sorted_articles[0]
225
+ story.append(Paragraph(f"<b>Title:</b> {top_positive.get('title', 'N/A')}", styles['Normal']))
226
+ story.append(Paragraph(f"<b>Source:</b> {top_positive.get('source', 'N/A')}", styles['Normal']))
227
+ story.append(Paragraph(f"<b>Sentiment Score:</b> {top_positive.get('sentiment', {}).get('compound', 0):.3f}", styles['Normal']))
228
+ if 'summary' in top_positive:
229
+ story.append(Paragraph(f"<b>Summary:</b> {top_positive['summary'][:300]}...", styles['Normal']))
230
+ story.append(Spacer(1, 0.2*inch))
231
+
232
+ # Most negative story
233
+ negative_articles = sorted(articles, key=lambda x: x.get('sentiment', {}).get('compound', 0))
234
+ if negative_articles and negative_articles[0].get('sentiment', {}).get('compound', 0) < -0.1:
235
+ story.append(Paragraph("Most Negative Coverage:", styles['Heading3']))
236
+ top_negative = negative_articles[0]
237
+ story.append(Paragraph(f"<b>Title:</b> {top_negative.get('title', 'N/A')}", styles['Normal']))
238
+ story.append(Paragraph(f"<b>Source:</b> {top_negative.get('source', 'N/A')}", styles['Normal']))
239
+ story.append(Paragraph(f"<b>Sentiment Score:</b> {top_negative.get('sentiment', {}).get('compound', 0):.3f}", styles['Normal']))
240
+ if 'summary' in top_negative:
241
+ story.append(Paragraph(f"<b>Summary:</b> {top_negative['summary'][:300]}...", styles['Normal']))
242
+ story.append(Spacer(1, 0.2*inch))
243
+
244
+ # Recent stories (if dates available)
245
+ recent_articles = [a for a in articles if a.get('date')]
246
+ if recent_articles:
247
+ recent_articles.sort(key=lambda x: x.get('date', ''), reverse=True)
248
+ story.append(Paragraph("Most Recent Coverage:", styles['Heading3']))
249
+ recent = recent_articles[0]
250
+ story.append(Paragraph(f"<b>Title:</b> {recent.get('title', 'N/A')}", styles['Normal']))
251
+ story.append(Paragraph(f"<b>Source:</b> {recent.get('source', 'N/A')}", styles['Normal']))
252
+ story.append(Paragraph(f"<b>Date:</b> {recent.get('date', 'N/A')}", styles['Normal']))
253
+ story.append(Paragraph(f"<b>Sentiment Score:</b> {recent.get('sentiment', {}).get('compound', 0):.3f}", styles['Normal']))
254
+
255
+ except Exception as e:
256
+ logger.error(f"Stories section creation failed: {str(e)}")
257
+ story.append(Paragraph("Story analysis data unavailable.", styles['Normal']))
258
+
259
+ return story
260
+
261
+ def _create_keywords_section(results: Dict[str, Any], styles) -> List:
262
+ """Create keywords section"""
263
+ story = []
264
+
265
+ try:
266
+ keywords = results.get('keywords', [])[:15] # Top 15 keywords
267
+
268
+ if not keywords:
269
+ story.append(Paragraph("No keywords extracted.", styles['Normal']))
270
+ return story
271
+
272
+ # Create keywords table
273
+ keyword_data = [['Keyword', 'Relevance Score', 'Category']]
274
+
275
+ for kw in keywords:
276
+ relevance = kw.get('relevance', 'medium')
277
+ score = kw.get('score', 0)
278
+ keyword_data.append([
279
+ kw.get('keyword', 'N/A'),
280
+ f"{score:.3f}",
281
+ relevance.title()
282
+ ])
283
+
284
+ keyword_table = Table(keyword_data)
285
+ keyword_table.setStyle(TableStyle([
286
+ ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#2E86AB')),
287
+ ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
288
+ ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
289
+ ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
290
+ ('FONTSIZE', (0, 0), (-1, 0), 10),
291
+ ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
292
+ ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
293
+ ('GRID', (0, 0), (-1, -1), 1, colors.black)
294
+ ]))
295
+
296
+ story.append(keyword_table)
297
+ story.append(Spacer(1, 0.2*inch))
298
+
299
+ # Keywords explanation
300
+ explanation = "Keywords were extracted using the YAKE (Yet Another Keyword Extractor) algorithm, "
301
+ explanation += "which identifies the most relevant terms and phrases based on statistical analysis of the text corpus."
302
+
303
+ story.append(Paragraph(explanation, styles['Normal']))
304
+
305
+ except Exception as e:
306
+ logger.error(f"Keywords section creation failed: {str(e)}")
307
+ story.append(Paragraph("Keyword analysis data unavailable.", styles['Normal']))
308
+
309
+ return story
310
+
311
+ def _create_sources_section(results: Dict[str, Any], styles) -> List:
312
+ """Create news sources section"""
313
+ story = []
314
+
315
+ try:
316
+ articles = results.get('articles', [])
317
+
318
+ if not articles:
319
+ story.append(Paragraph("No source data available.", styles['Normal']))
320
+ return story
321
+
322
+ # Count sources
323
+ source_counts = {}
324
+ for article in articles:
325
+ source = article.get('source', 'Unknown')
326
+ source_counts[source] = source_counts.get(source, 0) + 1
327
+
328
+ # Create sources table
329
+ source_data = [['News Source', 'Article Count', 'Percentage']]
330
+ total_articles = len(articles)
331
+
332
+ for source, count in sorted(source_counts.items(), key=lambda x: x[1], reverse=True):
333
+ percentage = (count / total_articles) * 100
334
+ source_data.append([source, str(count), f"{percentage:.1f}%"])
335
+
336
+ sources_table = Table(source_data)
337
+ sources_table.setStyle(TableStyle([
338
+ ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#2E86AB')),
339
+ ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
340
+ ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
341
+ ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
342
+ ('FONTSIZE', (0, 0), (-1, 0), 10),
343
+ ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
344
+ ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
345
+ ('GRID', (0, 0), (-1, -1), 1, colors.black)
346
+ ]))
347
+
348
+ story.append(sources_table)
349
+ story.append(Spacer(1, 0.2*inch))
350
+
351
+ # Sources explanation
352
+ explanation = f"Articles were collected from {len(source_counts)} different news sources, "
353
+ explanation += "providing diverse perspectives on the analyzed topic. Source diversity helps ensure comprehensive coverage and reduces bias."
354
+
355
+ story.append(Paragraph(explanation, styles['Normal']))
356
+
357
+ except Exception as e:
358
+ logger.error(f"Sources section creation failed: {str(e)}")
359
+ story.append(Paragraph("Source analysis data unavailable.", styles['Normal']))
360
+
361
+ return story
362
+
363
+ def _create_methodology_section(results: Dict[str, Any]) -> str:
364
+ """Create methodology section text"""
365
+ methodology = "This analysis employed a comprehensive natural language processing pipeline:\n\n"
366
+
367
+ methodology += "1. <b>Data Collection:</b> News articles were scraped from multiple reliable sources using RSS feeds and web scraping techniques. "
368
+ methodology += "Content was filtered for relevance and deduplicated to ensure quality.\n\n"
369
+
370
+ methodology += "2. <b>Sentiment Analysis:</b> Three complementary models were used: "
371
+ methodology += "VADER (general sentiment), Loughran-McDonald dictionary (financial sentiment), and FinBERT (financial domain-specific). "
372
+ methodology += "Final scores represent a weighted combination of all models.\n\n"
373
+
374
+ methodology += "3. <b>Text Processing:</b> Articles were cleaned, summarized using transformer models, and analyzed for key themes. "
375
+ methodology += "Keyword extraction employed the YAKE algorithm for statistical relevance.\n\n"
376
+
377
+ methodology += "4. <b>Quality Assurance:</b> All content was filtered for English language, minimum length requirements, and relevance to the query terms. "
378
+ methodology += "Results were validated across multiple model outputs for consistency.\n\n"
379
+
380
+ if results.get('processing_time'):
381
+ methodology += f"Total processing time: {results['processing_time']:.2f} seconds for {results.get('total_articles', 0)} articles."
382
+
383
+ return methodology
384
+
385
+ def _generate_simple_pdf_fallback(results: Dict[str, Any]) -> io.BytesIO:
386
+ """Generate a simple text-based PDF fallback"""
387
+ try:
388
+ from fpdf import FPDF
389
+
390
+ pdf = FPDF()
391
+ pdf.add_page()
392
+ pdf.set_font('Arial', 'B', 16)
393
+ pdf.cell(40, 10, 'News Analysis Report')
394
+ pdf.ln(20)
395
+
396
+ pdf.set_font('Arial', '', 12)
397
+ pdf.cell(40, 10, f"Query: {results.get('query', 'N/A')}")
398
+ pdf.ln(10)
399
+ pdf.cell(40, 10, f"Articles: {results.get('total_articles', 0)}")
400
+ pdf.ln(10)
401
+ pdf.cell(40, 10, f"Average Sentiment: {results.get('average_sentiment', 0):.3f}")
402
+ pdf.ln(20)
403
+
404
+ # Simple sentiment distribution
405
+ dist = results.get('sentiment_distribution', {})
406
+ pdf.cell(40, 10, 'Sentiment Distribution:')
407
+ pdf.ln(10)
408
+ pdf.cell(40, 10, f"Positive: {dist.get('Positive', 0)}")
409
+ pdf.ln(10)
410
+ pdf.cell(40, 10, f"Negative: {dist.get('Negative', 0)}")
411
+ pdf.ln(10)
412
+ pdf.cell(40, 10, f"Neutral: {dist.get('Neutral', 0)}")
413
+
414
+ # Save to buffer
415
+ buffer = io.BytesIO()
416
+ pdf_string = pdf.output(dest='S').encode('latin1')
417
+ buffer.write(pdf_string)
418
+ buffer.seek(0)
419
+
420
+ return buffer
421
+
422
+ except Exception as e:
423
+ logger.error(f"PDF fallback failed: {str(e)}")
424
+ # Return empty buffer as last resort
425
+ buffer = io.BytesIO()
426
+ buffer.write(b"PDF generation failed. Please check logs.")
427
+ buffer.seek(0)
428
+ return buffer
429
+
430
+ def create_chart_image(data: Dict, chart_type: str = 'pie') -> Optional[str]:
431
+ """Create a chart image for PDF inclusion"""
432
+ if not MATPLOTLIB_AVAILABLE:
433
+ return None
434
+
435
+ try:
436
+ plt.figure(figsize=(6, 4))
437
+
438
+ if chart_type == 'pie' and 'sentiment_distribution' in data:
439
+ dist = data['sentiment_distribution']
440
+ labels = ['Positive', 'Negative', 'Neutral']
441
+ sizes = [dist.get('Positive', 0), dist.get('Negative', 0), dist.get('Neutral', 0)]
442
+ colors = ['#28a745', '#dc3545', '#6c757d']
443
+
444
+ plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
445
+ plt.title('Sentiment Distribution')
446
+
447
+ elif chart_type == 'bar' and 'articles' in data:
448
+ articles = data['articles']
449
+ sources = {}
450
+ for article in articles:
451
+ source = article.get('source', 'Unknown')
452
+ sources[source] = sources.get(source, 0) + 1
453
+
454
+ # Top 10 sources
455
+ top_sources = dict(sorted(sources.items(), key=lambda x: x[1], reverse=True)[:10])
456
+
457
+ plt.bar(range(len(top_sources)), list(top_sources.values()), color='#2E86AB')
458
+ plt.xticks(range(len(top_sources)), list(top_sources.keys()), rotation=45, ha='right')
459
+ plt.title('Articles by Source')
460
+ plt.ylabel('Article Count')
461
+ plt.tight_layout()
462
+
463
+ # Save to base64 string
464
+ buffer = io.BytesIO()
465
+ plt.savefig(buffer, format='png', dpi=150, bbox_inches='tight')
466
+ buffer.seek(0)
467
+
468
+ image_base64 = base64.b64encode(buffer.getvalue()).decode()
469
+ plt.close()
470
+
471
+ return image_base64
472
+
473
+ except Exception as e:
474
+ logger.error(f"Chart creation failed: {str(e)}")
475
+ return None
476
+
477
+ def generate_csv_report(results: Dict[str, Any]) -> str:
478
+ """Generate CSV report"""
479
+ try:
480
+ import csv
481
+ import io
482
+
483
+ output = io.StringIO()
484
+ writer = csv.writer(output)
485
+
486
+ # Write header
487
+ writer.writerow([
488
+ 'Title', 'Source', 'URL', 'Date', 'Sentiment_Score', 'Sentiment_Label',
489
+ 'VADER_Score', 'LM_Score', 'FinBERT_Score', 'Summary'
490
+ ])
491
+
492
+ # Write article data
493
+ articles = results.get('articles', [])
494
+ for article in articles:
495
+ sentiment = article.get('sentiment', {})
496
+ compound = sentiment.get('compound', 0)
497
+
498
+ # Determine sentiment label
499
+ if compound > 0.1:
500
+ label = 'Positive'
501
+ elif compound < -0.1:
502
+ label = 'Negative'
503
+ else:
504
+ label = 'Neutral'
505
+
506
+ writer.writerow([
507
+ article.get('title', ''),
508
+ article.get('source', ''),
509
+ article.get('url', ''),
510
+ article.get('date', ''),
511
+ compound,
512
+ label,
513
+ sentiment.get('vader', ''),
514
+ sentiment.get('loughran_mcdonald', ''),
515
+ sentiment.get('finbert', ''),
516
+ article.get('summary', '')[:200] + '...' if len(article.get('summary', '')) > 200 else article.get('summary', '')
517
+ ])
518
+
519
+ return output.getvalue()
520
+
521
+ except Exception as e:
522
+ logger.error(f"CSV generation failed: {str(e)}")
523
+ return "Error generating CSV report"
524
+
525
+ def generate_json_report(results: Dict[str, Any]) -> str:
526
+ """Generate JSON report with formatted output"""
527
+ try:
528
+ import json
529
+ from datetime import datetime
530
+
531
+ # Create comprehensive report
532
+ report = {
533
+ 'metadata': {
534
+ 'report_generated': datetime.now().isoformat(),
535
+ 'query': results.get('query', ''),
536
+ 'total_articles': results.get('total_articles', 0),
537
+ 'processing_time_seconds': results.get('processing_time', 0),
538
+ 'languages': results.get('languages', ['English'])
539
+ },
540
+ 'summary': {
541
+ 'average_sentiment': results.get('average_sentiment', 0),
542
+ 'sentiment_distribution': results.get('sentiment_distribution', {}),
543
+ 'top_sources': _get_top_sources(results),
544
+ 'date_range': results.get('summary', {}).get('date_range', {})
545
+ },
546
+ 'articles': results.get('articles', []),
547
+ 'keywords': results.get('keywords', [])[:20], # Top 20 keywords
548
+ 'analysis_methods': {
549
+ 'sentiment_models': ['VADER', 'Loughran-McDonald', 'FinBERT'],
550
+ 'summarization_model': 'DistilBART',
551
+ 'keyword_extraction': 'YAKE',
552
+ 'translation_models': ['Helsinki-NLP Opus-MT']
553
+ }
554
+ }
555
+
556
+ return json.dumps(report, indent=2, default=str, ensure_ascii=False)
557
+
558
+ except Exception as e:
559
+ logger.error(f"JSON generation failed: {str(e)}")
560
+ return json.dumps({'error': str(e)}, indent=2)
561
+
562
+ def _get_top_sources(results: Dict[str, Any]) -> List[Dict[str, Any]]:
563
+ """Get top news sources from results"""
564
+ try:
565
+ articles = results.get('articles', [])
566
+ sources = {}
567
+
568
+ for article in articles:
569
+ source = article.get('source', 'Unknown')
570
+ sources[source] = sources.get(source, 0) + 1
571
+
572
+ # Convert to list and sort
573
+ source_list = [
574
+ {'source': source, 'count': count, 'percentage': round((count / len(articles)) * 100, 1)}
575
+ for source, count in sources.items()
576
+ ]
577
+
578
+ return sorted(source_list, key=lambda x: x['count'], reverse=True)[:10]
579
+
580
+ except Exception as e:
581
+ logger.error(f"Top sources calculation failed: {str(e)}")
582
+ return []
583
+
584
+ def validate_report_data(results: Dict[str, Any]) -> bool:
585
+ """Validate that results contain required data for reporting"""
586
+ required_keys = ['query', 'articles', 'total_articles']
587
+
588
+ for key in required_keys:
589
+ if key not in results:
590
+ logger.error(f"Missing required key for reporting: {key}")
591
+ return False
592
+
593
+ if not isinstance(results['articles'], list) or len(results['articles']) == 0:
594
+ logger.error("No articles available for reporting")
595
+ return False
596
+
597
+ return True
598
+
599
+ # Export functions
600
+ __all__ = [
601
+ 'generate_pdf_report',
602
+ 'generate_csv_report',
603
+ 'generate_json_report',
604
+ 'create_chart_image',
605
+ 'validate_report_data'
606
+ ]
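+
+ # Illustrative usage sketch of the exported helpers. The sample dict below is
+ # hypothetical; real results come from the analysis pipeline.
+ if __name__ == "__main__":
+     sample_results = {
+         'query': 'Tesla',
+         'total_articles': 1,
+         'average_sentiment': 0.42,
+         'sentiment_distribution': {'Positive': 1, 'Negative': 0, 'Neutral': 0},
+         'articles': [{
+             'title': 'Sample headline', 'source': 'Example Wire', 'url': '', 'date': '',
+             'summary': 'Short summary.', 'sentiment': {'compound': 0.42},
+         }],
+     }
+     if validate_report_data(sample_results):
+         print(generate_csv_report(sample_results))
+         print(generate_json_report(sample_results)[:300])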
requirements_file.txt ADDED
@@ -0,0 +1,47 @@
1
+ # Core Framework
2
+ streamlit==1.28.1
3
+ fastapi==0.104.1
4
+ uvicorn==0.24.0
5
+
6
+ # Web Scraping & RSS
7
+ requests==2.31.0
8
+ beautifulsoup4==4.12.2
9
+ feedparser==6.0.10
10
+ trafilatura==1.6.2
11
+ lxml==4.9.3
12
+
13
+ # NLP & Machine Learning
14
+ transformers==4.35.2
15
+ # CPU-only PyTorch wheels; the index must be given as a file-level option,
+ # since per-requirement --index-url is not valid requirements.txt syntax
+ --extra-index-url https://download.pytorch.org/whl/cpu
+ torch==2.1.0
16
+ nltk==3.8.1
17
+ langdetect==1.0.9
18
+ yake==0.4.8
19
+ vaderSentiment==3.3.2
20
+
21
+ # Data Processing
22
+ pandas==2.0.3
23
+ numpy==1.24.3
24
+
25
+ # Visualization
26
+ plotly==5.17.0
27
+ matplotlib==3.7.2
28
+ wordcloud==1.9.2
29
+
30
+ # Translation & Audio
31
+ gtts==2.4.0
32
+
33
+ # Report Generation
34
+ reportlab==4.0.4
35
+ fpdf2==2.7.6
36
+
37
+ # Utilities
38
+ python-dotenv==1.0.0
39
+ psutil==5.9.5
40
+ Pillow==10.0.1
41
+
42
+ # HTTP & Async
43
+ httpx==0.25.0
44
+ aiofiles==23.2.1
45
+
46
+ # Caching
47
+ diskcache==5.6.3
scraper_module.py ADDED
@@ -0,0 +1,396 @@
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ import feedparser
4
+ import trafilatura
5
+ from urllib.parse import urljoin, urlparse
6
+ import time
7
+ import logging
8
+ from datetime import datetime, timedelta
9
+ from typing import List, Dict, Optional, Set
10
+ import hashlib
11
+ import re
12
+ from langdetect import detect
13
+ import random
14
+ from requests.adapters import HTTPAdapter
15
+ from urllib3.util.retry import Retry
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+ class NewsletterScraper:
20
+ """Robust news scraper with multiple sources and deduplication"""
21
+
22
+ def __init__(self):
23
+ self.session = self._create_session()
24
+ self.scraped_urls: Set[str] = set()
25
+ self.content_hashes: Set[str] = set()
26
+
27
+ # News sources configuration
28
+ self.rss_sources = {
29
+ 'google_news': 'https://news.google.com/rss/search?q={}&hl=en&gl=US&ceid=US:en',
30
+ 'yahoo_finance': 'https://feeds.finance.yahoo.com/rss/2.0/headline',
31
+ 'reuters_business': 'https://www.reutersagency.com/feed/?best-topics=business-finance&post_type=best',
32
+ 'bbc_business': 'http://feeds.bbci.co.uk/news/business/rss.xml',
33
+ 'cnbc': 'https://www.cnbc.com/id/100003114/device/rss/rss.html',
34
+ 'marketwatch': 'http://feeds.marketwatch.com/marketwatch/topstories/',
35
+ 'financial_times': 'https://www.ft.com/rss/home',
36
+ 'bloomberg': 'https://feeds.bloomberg.com/politics/news.rss'
37
+ }
38
+
39
+ self.user_agents = [
40
+ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
41
+ 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
42
+ 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
43
+ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
44
+ 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:89.0) Gecko/20100101 Firefox/89.0'
45
+ ]
46
+
47
+ logger.info("NewsletterScraper initialized")
48
+
49
+ def _create_session(self) -> requests.Session:
50
+ """Create a session with retry strategy"""
51
+ session = requests.Session()
52
+
53
+ # Retry strategy
54
+ retry_strategy = Retry(
55
+ total=3,
56
+ backoff_factor=1,
57
+ status_forcelist=[429, 500, 502, 503, 504],
58
+ )
59
+
60
+ adapter = HTTPAdapter(max_retries=retry_strategy)
61
+ session.mount("http://", adapter)
62
+ session.mount("https://", adapter)
63
+
64
+ return session
65
+
66
+ def _get_random_headers(self) -> Dict[str, str]:
67
+ """Get randomized headers to avoid blocking"""
68
+ return {
69
+ 'User-Agent': random.choice(self.user_agents),
70
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
71
+ 'Accept-Language': 'en-US,en;q=0.5',
72
+ 'Accept-Encoding': 'gzip, deflate',
73
+ 'Connection': 'keep-alive',
74
+ 'Upgrade-Insecure-Requests': '1',
75
+ }
76
+
77
+ def scrape_news(self, query: str, max_articles: int = 20) -> List[Dict]:
78
+ """Main scraping function"""
79
+ logger.info(f"Starting news scraping for query: {query}")
80
+
81
+ all_articles = []
82
+ self.scraped_urls.clear()
83
+ self.content_hashes.clear()
84
+
85
+ try:
86
+ # Primary: Google News RSS
87
+ google_articles = self._scrape_google_news(query, max_articles // 2)
88
+ all_articles.extend(google_articles)
89
+
90
+ # Secondary: Other RSS sources
91
+ for source_name, rss_url in list(self.rss_sources.items())[1:4]: # Limit to avoid timeouts
92
+ if len(all_articles) >= max_articles:
93
+ break
94
+
95
+ try:
96
+ source_articles = self._scrape_rss_source(rss_url, query, 5)
97
+ all_articles.extend(source_articles)
98
+ except Exception as e:
99
+ logger.warning(f"Failed to scrape {source_name}: {str(e)}")
100
+ continue
101
+
102
+ # Deduplicate and filter
103
+ articles = self._deduplicate_articles(all_articles)
104
+ articles = self._filter_articles(articles, query)
105
+ articles = articles[:max_articles]
106
+
107
+ # Extract full content
108
+ for article in articles:
109
+ try:
110
+ full_content = self._extract_full_content(article['url'])
111
+ if full_content and len(full_content) > 200:
112
+ article['content'] = full_content
113
+ else:
114
+ article['content'] = article.get('summary', article.get('title', ''))
115
+ except Exception as e:
116
+ logger.warning(f"Failed to extract content from {article['url']}: {str(e)}")
117
+ article['content'] = article.get('summary', article.get('title', ''))
118
+
119
+ # Filter by language (English only)
120
+ articles = [article for article in articles if self._is_english(article['content'])]
121
+
122
+ logger.info(f"Successfully scraped {len(articles)} articles")
123
+ return articles
124
+
125
+ except Exception as e:
126
+ logger.error(f"Error in scrape_news: {str(e)}")
127
+ return []
128
+
129
+ def _scrape_google_news(self, query: str, max_articles: int) -> List[Dict]:
130
+ """Scrape Google News RSS"""
131
+ try:
132
+ url = self.rss_sources['google_news'].format(query.replace(' ', '%20'))
133
+
134
+ headers = self._get_random_headers()
135
+ response = self.session.get(url, headers=headers, timeout=10)
136
+
137
+ if response.status_code != 200:
138
+ logger.warning(f"Google News RSS returned status {response.status_code}")
139
+ return []
140
+
141
+ feed = feedparser.parse(response.content)
142
+ articles = []
143
+
144
+ for entry in feed.entries[:max_articles * 2]: # Get extra for filtering
145
+ try:
146
+ article = {
147
+ 'title': entry.title,
148
+ 'url': entry.link,
149
+ 'summary': entry.get('summary', ''),
150
+ 'date': self._parse_date(entry.get('published', '')),
151
+ 'source': 'Google News'
152
+ }
153
+
154
+ # Skip if already seen
155
+ if article['url'] in self.scraped_urls:
156
+ continue
157
+
158
+ self.scraped_urls.add(article['url'])
159
+ articles.append(article)
160
+
161
+ except Exception as e:
162
+ logger.warning(f"Error parsing Google News entry: {str(e)}")
163
+ continue
164
+
165
+ return articles
166
+
167
+ except Exception as e:
168
+ logger.error(f"Error scraping Google News: {str(e)}")
169
+ return []
170
+
171
+ def _scrape_rss_source(self, rss_url: str, query: str, max_articles: int) -> List[Dict]:
172
+ """Scrape a generic RSS source"""
173
+ try:
174
+ headers = self._get_random_headers()
175
+ response = self.session.get(rss_url, headers=headers, timeout=10)
176
+
177
+ if response.status_code != 200:
178
+ return []
179
+
180
+ feed = feedparser.parse(response.content)
181
+ articles = []
182
+ query_lower = query.lower()
183
+
184
+ for entry in feed.entries[:max_articles * 3]: # Get extra for filtering
185
+ try:
186
+ title = entry.get('title', '')
187
+ summary = entry.get('summary', '')
188
+
189
+ # Check if article is relevant to query
190
+ if not (query_lower in title.lower() or query_lower in summary.lower()):
191
+ continue
192
+
193
+ article = {
194
+ 'title': title,
195
+ 'url': entry.get('link', ''),
196
+ 'summary': summary,
197
+ 'date': self._parse_date(entry.get('published', '')),
198
+ 'source': self._extract_source_name(rss_url)
199
+ }
200
+
201
+ # Skip if already seen
202
+ if article['url'] in self.scraped_urls:
203
+ continue
204
+
205
+ self.scraped_urls.add(article['url'])
206
+ articles.append(article)
207
+
208
+ if len(articles) >= max_articles:
209
+ break
210
+
211
+ except Exception as e:
212
+ logger.warning(f"Error parsing RSS entry: {str(e)}")
213
+ continue
214
+
215
+ # Small delay to be respectful
216
+ time.sleep(0.5)
217
+
218
+ return articles
219
+
220
+ except Exception as e:
221
+ logger.error(f"Error scraping RSS {rss_url}: {str(e)}")
222
+ return []
223
+
224
+ def _extract_full_content(self, url: str) -> Optional[str]:
225
+ """Extract full article content using trafilatura"""
226
+ try:
227
+ # Download the page; trafilatura.fetch_url() manages its own HTTP handling and
+ # does not accept custom headers (the randomized headers are used for RSS requests only)
+ downloaded = trafilatura.fetch_url(url)
231
+
232
+ if not downloaded:
233
+ return None
234
+
235
+ # Extract text content
236
+ text = trafilatura.extract(
237
+ downloaded,
238
+ include_comments=False,
239
+ include_tables=False,
240
+ include_formatting=False,
241
+ no_fallback=False
242
+ )
243
+
244
+ if text and len(text.strip()) > 100:
245
+ return text.strip()
246
+
247
+ return None
248
+
249
+ except Exception as e:
250
+ logger.warning(f"Error extracting content from {url}: {str(e)}")
251
+ return None
252
+
253
+ def _deduplicate_articles(self, articles: List[Dict]) -> List[Dict]:
254
+ """Remove duplicate articles based on content similarity"""
255
+ unique_articles = []
256
+
257
+ for article in articles:
258
+ # Create content hash
259
+ content_for_hash = f"{article['title']} {article.get('summary', '')}"
260
+ content_hash = hashlib.md5(content_for_hash.encode()).hexdigest()
261
+
262
+ if content_hash not in self.content_hashes:
263
+ self.content_hashes.add(content_hash)
264
+ unique_articles.append(article)
265
+
266
+ logger.info(f"Deduplicated {len(articles)} -> {len(unique_articles)} articles")
267
+ return unique_articles
268
+
269
+ def _filter_articles(self, articles: List[Dict], query: str) -> List[Dict]:
270
+ """Filter articles for relevance and quality"""
271
+ filtered_articles = []
272
+ query_lower = query.lower()
273
+
274
+ for article in articles:
275
+ # Check minimum content length
276
+ title_summary = f"{article['title']} {article.get('summary', '')}"
277
+ if len(title_summary.strip()) < 50:
278
+ continue
279
+
280
+ # Check relevance (more flexible than RSS filtering)
281
+ if (query_lower in article['title'].lower() or
282
+ query_lower in article.get('summary', '').lower() or
283
+ any(word in article['title'].lower() for word in query_lower.split())):
284
+
285
+ filtered_articles.append(article)
286
+
287
+ logger.info(f"Filtered {len(articles)} -> {len(filtered_articles)} articles for relevance")
288
+ return filtered_articles
289
+
290
+ def _is_english(self, text: str) -> bool:
291
+ """Check if text is in English using language detection"""
292
+ try:
293
+ if len(text.strip()) < 20:
294
+ return True # Assume short text is English
295
+
296
+ detected_lang = detect(text[:1000]) # Check first 1000 chars
297
+ return detected_lang == 'en'
298
+
299
+ except Exception:
300
+ # If detection fails, assume English
301
+ return True
302
+
303
+ def _parse_date(self, date_str: str) -> Optional[datetime]:
304
+ """Parse date from RSS feed"""
305
+ if not date_str:
306
+ return datetime.now()
307
+
308
+ try:
309
+ # Try common RSS date formats
310
+ for fmt in ['%a, %d %b %Y %H:%M:%S %Z',
311
+ '%Y-%m-%dT%H:%M:%SZ',
312
+ '%Y-%m-%d %H:%M:%S']:
313
+ try:
314
+ return datetime.strptime(date_str.strip(), fmt)
315
+ except ValueError:
316
+ continue
317
+
318
+ # If all fails, return current time
319
+ return datetime.now()
320
+
321
+ except Exception:
322
+ return datetime.now()
323
+
324
+ def _extract_source_name(self, url: str) -> str:
325
+ """Extract source name from URL"""
326
+ try:
327
+ domain = urlparse(url).netloc
328
+
329
+ # Clean up common domain patterns
330
+ domain = domain.replace('www.', '').replace('feeds.', '')
331
+
332
+ # Map known domains to clean names
333
+ domain_mapping = {
334
+ 'news.google.com': 'Google News',
335
+ 'finance.yahoo.com': 'Yahoo Finance',
336
+ 'reuters.com': 'Reuters',
337
+ 'reutersagency.com': 'Reuters',
338
+ 'bbc.co.uk': 'BBC',
339
+ 'cnbc.com': 'CNBC',
340
+ 'marketwatch.com': 'MarketWatch',
341
+ 'ft.com': 'Financial Times',
342
+ 'bloomberg.com': 'Bloomberg'
343
+ }
344
+
345
+ return domain_mapping.get(domain, domain.title())
346
+
347
+ except Exception:
348
+ return 'Unknown'
349
+
350
+ def get_available_sources(self) -> List[str]:
351
+ """Get list of available news sources"""
352
+ return list(self.rss_sources.keys())
353
+
354
+ # Additional utility functions for scraping
355
+ def clean_html(html_content: str) -> str:
356
+ """Clean HTML content and extract text"""
357
+ try:
358
+ soup = BeautifulSoup(html_content, 'html.parser')
359
+
360
+ # Remove script and style elements
361
+ for script in soup(["script", "style"]):
362
+ script.extract()
363
+
364
+ # Get text
365
+ text = soup.get_text()
366
+
367
+ # Clean up whitespace
368
+ lines = (line.strip() for line in text.splitlines())
369
+ chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
370
+ text = ' '.join(chunk for chunk in chunks if chunk)
371
+
372
+ return text
373
+
374
+ except Exception as e:
375
+ logger.error(f"Error cleaning HTML: {str(e)}")
376
+ return ""
377
+
378
+ def is_valid_article_url(url: str) -> bool:
379
+ """Check if URL is likely to be a valid article URL"""
380
+ try:
381
+ parsed = urlparse(url)
382
+
383
+ # Skip certain file types
384
+ skip_extensions = ['.pdf', '.jpg', '.png', '.gif', '.mp4', '.mp3']
385
+ if any(url.lower().endswith(ext) for ext in skip_extensions):
386
+ return False
387
+
388
+ # Skip obvious non-article URLs
389
+ skip_patterns = ['login', 'register', 'subscribe', 'newsletter', 'sitemap']
390
+ if any(pattern in url.lower() for pattern in skip_patterns):
391
+ return False
392
+
393
+ return True
394
+
395
+ except Exception:
396
+ return False
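+
+ # Minimal usage sketch (illustrative only; performs live network requests and
+ # depends on the RSS sources above being reachable):
+ if __name__ == "__main__":
+     scraper = NewsletterScraper()
+     for item in scraper.scrape_news("Tesla", max_articles=5):
+         print(f"{item['source']}: {item['title']}")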
streamlit_app.py ADDED
@@ -0,0 +1,562 @@
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import plotly.express as px
4
+ import plotly.graph_objects as go
5
+ from wordcloud import WordCloud
6
+ import matplotlib.pyplot as plt
7
+ import asyncio
8
+ import json
9
+ import base64
10
+ from datetime import datetime
11
+ import io
12
+ import os
13
+
14
+ # Import our modules
15
+ from api import NewsAnalyzer
16
+ from utils import load_config, cache_results
17
+ from report import generate_pdf_report
18
+
19
+ # Configure page
20
+ st.set_page_config(
21
+ page_title="Global Business News Intelligence Dashboard",
22
+ page_icon="📊",
23
+ layout="wide",
24
+ initial_sidebar_state="expanded"
25
+ )
26
+
27
+ # Custom CSS
28
+ st.markdown("""
29
+ <style>
30
+ .main-header {
31
+ font-size: 2.5rem;
32
+ font-weight: bold;
33
+ text-align: center;
34
+ color: #2E86AB;
35
+ margin-bottom: 2rem;
36
+ }
37
+ .metric-card {
38
+ background-color: #f0f2f6;
39
+ padding: 1rem;
40
+ border-radius: 10px;
41
+ border-left: 4px solid #2E86AB;
42
+ }
43
+ .sentiment-positive { color: #28a745; font-weight: bold; }
44
+ .sentiment-negative { color: #dc3545; font-weight: bold; }
45
+ .sentiment-neutral { color: #6c757d; font-weight: bold; }
46
+ .audio-container {
47
+ background-color: #f8f9fa;
48
+ padding: 10px;
49
+ border-radius: 5px;
50
+ margin: 10px 0;
51
+ }
52
+ </style>
53
+ """, unsafe_allow_html=True)
54
+
55
+ # Initialize session state
56
+ if 'analyzer' not in st.session_state:
57
+ st.session_state.analyzer = NewsAnalyzer()
58
+ if 'results' not in st.session_state:
59
+ st.session_state.results = None
60
+ if 'analysis_complete' not in st.session_state:
61
+ st.session_state.analysis_complete = False
62
+
63
+ def main():
64
+ # Header
65
+ st.markdown('<h1 class="main-header">🌐 Global Business News Intelligence Dashboard</h1>', unsafe_allow_html=True)
66
+ st.markdown("**Real-time sentiment analysis, multilingual summaries, and audio insights for business intelligence**")
67
+
68
+ # Sidebar
69
+ with st.sidebar:
70
+ st.header("⚙️ Configuration")
71
+
72
+ # Input section
73
+ st.subheader("🎯 Target Analysis")
74
+ query_type = st.selectbox("Query Type", ["Company", "Stock Ticker", "Keyword", "Industry"])
75
+ query = st.text_input(f"Enter {query_type}:", placeholder="e.g., Tesla, TSLA, AI technology")
76
+
77
+ st.subheader("📊 Analysis Settings")
78
+ num_articles = st.slider("Number of Articles", 5, 50, 20)
79
+ languages = st.multiselect(
80
+ "Summary Languages",
81
+ ["English", "Hindi", "Tamil"],
82
+ default=["English"]
83
+ )
84
+ include_audio = st.checkbox("Generate Audio Summaries", True)
85
+
86
+ st.subheader("🔧 Model Settings")
87
+ sentiment_models = st.multiselect(
88
+ "Sentiment Models",
89
+ ["VADER", "Loughran-McDonald", "FinBERT"],
90
+ default=["VADER", "Loughran-McDonald", "FinBERT"]
91
+ )
92
+
93
+ # Analysis button
94
+ analyze_button = st.button("🚀 Analyze News", type="primary", use_container_width=True)
95
+
96
+ # Main content area
97
+ if analyze_button and query:
98
+ st.session_state.analysis_complete = False
99
+ with st.spinner("🔍 Analyzing news articles... This may take a few minutes."):
100
+ try:
101
+ # Create progress bar
102
+ progress_bar = st.progress(0)
103
+ status_text = st.empty()
104
+
105
+ # Run analysis
106
+ config = {
107
+ 'query': query,
108
+ 'num_articles': num_articles,
109
+ 'languages': languages,
110
+ 'include_audio': include_audio,
111
+ 'sentiment_models': sentiment_models
112
+ }
113
+
114
+ # Update progress
115
+ status_text.text("🔍 Scraping articles...")
116
+ progress_bar.progress(20)
117
+
118
+ results = st.session_state.analyzer.analyze_news(config, progress_callback=update_progress)
119
+ st.session_state.results = results
120
+ st.session_state.analysis_complete = True
121
+
122
+ progress_bar.progress(100)
123
+ status_text.text("✅ Analysis complete!")
124
+
125
+ except Exception as e:
126
+ st.error(f"Error during analysis: {str(e)}")
127
+ st.session_state.analysis_complete = False
128
+
129
+ # Display results
130
+ if st.session_state.analysis_complete and st.session_state.results:
131
+ display_results(st.session_state.results)
132
+
133
+ elif not st.session_state.analysis_complete and query:
134
+ st.info("👆 Click 'Analyze News' to start the analysis")
135
+
136
+ else:
137
+ show_demo_dashboard()
138
+
139
+ def update_progress(progress, status):
140
+ """Callback function for progress updates"""
141
+ # This would be called from the analyzer
142
+ pass
143
+
144
+ def display_results(results):
145
+ """Display analysis results with interactive dashboard"""
146
+ st.header(f"📈 Analysis Results for: {results['query']}")
147
+
148
+ # Key metrics
149
+ col1, col2, col3, col4 = st.columns(4)
150
+
151
+ with col1:
152
+ st.markdown('<div class="metric-card">', unsafe_allow_html=True)
153
+ st.metric("Articles Analyzed", len(results['articles']))
154
+ st.markdown('</div>', unsafe_allow_html=True)
155
+
156
+ with col2:
157
+ avg_sentiment = results['summary']['average_sentiment']
158
+ sentiment_color = "sentiment-positive" if avg_sentiment > 0.1 else "sentiment-negative" if avg_sentiment < -0.1 else "sentiment-neutral"
159
+ st.markdown('<div class="metric-card">', unsafe_allow_html=True)
160
+ st.metric("Average Sentiment", f"{avg_sentiment:.3f}")
161
+ st.markdown('</div>', unsafe_allow_html=True)
162
+
163
+ with col3:
164
+ st.markdown('<div class="metric-card">', unsafe_allow_html=True)
165
+ st.metric("Sources", len(set([article['source'] for article in results['articles']])))
166
+ st.markdown('</div>', unsafe_allow_html=True)
167
+
168
+ with col4:
169
+ st.markdown('<div class="metric-card">', unsafe_allow_html=True)
170
+ st.metric("Languages", len(results.get('languages', ['English'])))
171
+ st.markdown('</div>', unsafe_allow_html=True)
172
+
173
+ # Tabs for different views
174
+ tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs(["📊 Dashboard", "📰 Articles", "🎯 Sentiment", "🗣️ Audio", "📤 Export", "🔌 API"])
175
+
176
+ with tab1:
177
+ display_dashboard(results)
178
+
179
+ with tab2:
180
+ display_articles(results)
181
+
182
+ with tab3:
183
+ display_sentiment_analysis(results)
184
+
185
+ with tab4:
186
+ display_audio_summaries(results)
187
+
188
+ with tab5:
189
+ display_export_options(results)
190
+
191
+ with tab6:
192
+ display_api_info(results)
193
+
194
+ def display_dashboard(results):
195
+ """Display main dashboard with charts"""
196
+ col1, col2 = st.columns(2)
197
+
198
+ with col1:
199
+ # Sentiment distribution
200
+ st.subheader("📊 Sentiment Distribution")
201
+ sentiment_counts = {
202
+ 'Positive': sum(1 for article in results['articles'] if article['sentiment']['compound'] > 0.1),
203
+ 'Negative': sum(1 for article in results['articles'] if article['sentiment']['compound'] < -0.1),
204
+ 'Neutral': sum(1 for article in results['articles'] if -0.1 <= article['sentiment']['compound'] <= 0.1)
205
+ }
206
+
207
+ fig_pie = px.pie(
208
+ values=list(sentiment_counts.values()),
209
+ names=list(sentiment_counts.keys()),
210
+ color=list(sentiment_counts.keys()),  # px.pie applies color_discrete_map only when color is set
+ color_discrete_map={'Positive': '#28a745', 'Negative': '#dc3545', 'Neutral': '#6c757d'}
211
+ )
212
+ st.plotly_chart(fig_pie, use_container_width=True)
213
+
214
+ with col2:
215
+ # Source distribution
216
+ st.subheader("📰 Source Distribution")
217
+ source_counts = {}
218
+ for article in results['articles']:
219
+ source = article['source']
220
+ source_counts[source] = source_counts.get(source, 0) + 1
221
+
222
+ fig_bar = px.bar(
223
+ x=list(source_counts.keys()),
224
+ y=list(source_counts.values()),
225
+ color=list(source_counts.values()),
226
+ color_continuous_scale="viridis"
227
+ )
228
+ fig_bar.update_layout(xaxis_title="Source", yaxis_title="Article Count")
229
+ st.plotly_chart(fig_bar, use_container_width=True)
230
+
231
+ # Timeline chart
232
+ st.subheader("📈 Sentiment Over Time")
233
+ if results['articles']:
234
+ df_timeline = pd.DataFrame([
235
+ {
236
+ 'date': article.get('date', datetime.now()),
237
+ 'sentiment': article['sentiment']['compound'],
238
+ 'title': article['title'][:50] + "..." if len(article['title']) > 50 else article['title']
239
+ }
240
+ for article in results['articles']
241
+ if 'date' in article
242
+ ])
243
+
244
+ if not df_timeline.empty:
245
+ fig_timeline = px.scatter(
246
+ df_timeline,
247
+ x='date',
248
+ y='sentiment',
249
+ hover_data=['title'],
250
+ color='sentiment',
251
+ color_continuous_scale=['red', 'gray', 'green'],
252
+ color_continuous_midpoint=0
253
+ )
254
+ fig_timeline.update_layout(
255
+ xaxis_title="Date",
256
+ yaxis_title="Sentiment Score",
257
+ yaxis=dict(range=[-1, 1])
258
+ )
259
+ st.plotly_chart(fig_timeline, use_container_width=True)
260
+
261
+ # Keywords word cloud
262
+ st.subheader("🔤 Key Topics")
263
+ if 'keywords' in results and results['keywords']:
264
+ col1, col2 = st.columns([2, 1])
265
+
266
+ with col1:
267
+ # Create word cloud
268
+ keywords_text = ' '.join([kw['keyword'] for kw in results['keywords'][:50]])
269
+ if keywords_text:
270
+ wordcloud = WordCloud(
271
+ width=800,
272
+ height=400,
273
+ background_color='white',
274
+ colormap='viridis'
275
+ ).generate(keywords_text)
276
+
277
+ fig, ax = plt.subplots(figsize=(10, 5))
278
+ ax.imshow(wordcloud, interpolation='bilinear')
279
+ ax.axis('off')
280
+ st.pyplot(fig)
281
+
282
+ with col2:
283
+ st.write("**Top Keywords:**")
284
+ for i, kw in enumerate(results['keywords'][:10]):
285
+ st.write(f"{i+1}. {kw['keyword']} ({kw['score']:.3f})")
286
+
287
+ def display_articles(results):
288
+ """Display individual articles with summaries"""
289
+ st.subheader(f"📰 Articles ({len(results['articles'])})")
290
+
291
+ for i, article in enumerate(results['articles']):
292
+ with st.expander(f"📄 {article['title']}", expanded=(i < 3)):
293
+ col1, col2 = st.columns([3, 1])
294
+
295
+ with col1:
296
+ st.write(f"**Source:** {article['source']}")
297
+ if 'date' in article:
298
+ st.write(f"**Date:** {article['date']}")
299
+ st.write(f"**URL:** {article.get('url', 'N/A')}")
300
+
301
+ # Sentiment
302
+ sentiment = article['sentiment']
303
+ sentiment_label = "Positive" if sentiment['compound'] > 0.1 else "Negative" if sentiment['compound'] < -0.1 else "Neutral"
304
+ sentiment_color = "sentiment-positive" if sentiment_label == "Positive" else "sentiment-negative" if sentiment_label == "Negative" else "sentiment-neutral"
305
+ st.markdown(f"**Sentiment:** <span class='{sentiment_color}'>{sentiment_label} ({sentiment['compound']:.3f})</span>", unsafe_allow_html=True)
306
+
307
+ with col2:
308
+ # Model-specific scores
309
+ st.write("**Model Scores:**")
310
+ if 'vader' in sentiment:
311
+ st.write(f"VADER: {sentiment['vader']:.3f}")
312
+ if 'loughran_mcdonald' in sentiment:
313
+ st.write(f"L&M: {sentiment['loughran_mcdonald']:.3f}")
314
+ if 'finbert' in sentiment:
315
+ st.write(f"FinBERT: {sentiment['finbert']:.3f}")
316
+
317
+ # Summary
318
+ if 'summary' in article:
319
+ st.write("**Summary:**")
320
+ st.write(article['summary'])
321
+
322
+ # Multilingual summaries
323
+ if 'summaries' in article:
324
+ for lang, summary in article['summaries'].items():
325
+ if lang != 'English':
326
+ st.write(f"**Summary ({lang}):**")
327
+ st.write(summary)
328
+
329
+ def display_sentiment_analysis(results):
330
+ """Display detailed sentiment analysis"""
331
+ st.subheader("🎯 Detailed Sentiment Analysis")
332
+
333
+ # Model comparison
334
+ if results['articles']:
335
+ model_data = []
336
+ for article in results['articles']:
337
+ sentiment = article['sentiment']
338
+ row = {'title': article['title'][:30] + "..."}
339
+ if 'vader' in sentiment:
340
+ row['VADER'] = sentiment['vader']
341
+ if 'loughran_mcdonald' in sentiment:
342
+ row['Loughran-McDonald'] = sentiment['loughran_mcdonald']
343
+ if 'finbert' in sentiment:
344
+ row['FinBERT'] = sentiment['finbert']
345
+ row['Final Score'] = sentiment['compound']
346
+ model_data.append(row)
347
+
348
+ df_models = pd.DataFrame(model_data)
349
+ st.write("**Model Comparison:**")
350
+ st.dataframe(df_models, use_container_width=True)
351
+
352
+ # Correlation heatmap
353
+ numeric_cols = [col for col in df_models.columns if col != 'title']
354
+ if len(numeric_cols) > 1:
355
+ corr_matrix = df_models[numeric_cols].corr()
356
+ fig_heatmap = px.imshow(
357
+ corr_matrix,
358
+ text_auto=True,
359
+ aspect="auto",
360
+ color_continuous_scale="RdBu_r",
361
+ color_continuous_midpoint=0
362
+ )
363
+ fig_heatmap.update_layout(title="Model Correlation Matrix")
364
+ st.plotly_chart(fig_heatmap, use_container_width=True)
365
+
366
+ # Top positive and negative articles
367
+ col1, col2 = st.columns(2)
368
+
369
+ with col1:
370
+ st.write("**Most Positive Articles:**")
371
+ positive_articles = sorted(
372
+ results['articles'],
373
+ key=lambda x: x['sentiment']['compound'],
374
+ reverse=True
375
+ )[:5]
376
+
377
+ for article in positive_articles:
378
+ st.write(f"• {article['title'][:50]}... ({article['sentiment']['compound']:.3f})")
379
+
380
+ with col2:
381
+ st.write("**Most Negative Articles:**")
382
+ negative_articles = sorted(
383
+ results['articles'],
384
+ key=lambda x: x['sentiment']['compound']
385
+ )[:5]
386
+
387
+ for article in negative_articles:
388
+ st.write(f"• {article['title'][:50]}... ({article['sentiment']['compound']:.3f})")
389
+
390
+ def display_audio_summaries(results):
391
+ """Display audio summaries for different languages"""
392
+ st.subheader("🎵 Audio Summaries")
393
+
394
+ if 'audio_files' in results:
395
+ for lang, audio_file in results['audio_files'].items():
396
+ st.write(f"**{lang} Summary:**")
397
+
398
+ # Create audio player
399
+ if os.path.exists(audio_file):
400
+ with open(audio_file, 'rb') as audio_file_obj:
401
+ audio_bytes = audio_file_obj.read()
402
+ st.audio(audio_bytes, format='audio/mp3')
403
+ else:
404
+ st.write("Audio file not found")
405
+ else:
406
+ st.info("No audio summaries available. Enable audio generation in settings.")
407
+
408
+ def display_export_options(results):
409
+ """Display export options"""
410
+ st.subheader("📤 Export Results")
411
+
412
+ col1, col2, col3 = st.columns(3)
413
+
414
+ with col1:
415
+ # CSV Export
416
+ if st.button("📊 Download CSV", use_container_width=True):
417
+ csv_data = prepare_csv_export(results)
418
+ st.download_button(
419
+ label="Click to Download CSV",
420
+ data=csv_data,
421
+ file_name=f"news_analysis_{datetime.now().strftime('%Y%m%d_%H%M')}.csv",
422
+ mime="text/csv"
423
+ )
424
+
425
+ with col2:
426
+ # JSON Export
427
+ if st.button("📋 Download JSON", use_container_width=True):
428
+ json_data = json.dumps(results, indent=2, default=str)
429
+ st.download_button(
430
+ label="Click to Download JSON",
431
+ data=json_data,
432
+ file_name=f"news_analysis_{datetime.now().strftime('%Y%m%d_%H%M')}.json",
433
+ mime="application/json"
434
+ )
435
+
436
+ with col3:
437
+ # PDF Report
438
+ if st.button("📄 Generate PDF Report", use_container_width=True):
439
+ try:
440
+ pdf_buffer = generate_pdf_report(results)
441
+ st.download_button(
442
+ label="Click to Download PDF",
443
+ data=pdf_buffer,
444
+ file_name=f"news_analysis_report_{datetime.now().strftime('%Y%m%d_%H%M')}.pdf",
445
+ mime="application/pdf"
446
+ )
447
+ except Exception as e:
448
+ st.error(f"Error generating PDF: {str(e)}")
449
+
450
+ def display_api_info(results):
451
+ """Display API information and examples"""
452
+ st.subheader("🔌 API Access")
453
+
454
+ st.write("**Endpoint:** `/api/analyze`")
455
+ st.write("**Method:** GET")
456
+ st.write("**Parameters:**")
457
+ st.code("""
458
+ - query: string (required) - Company name, ticker, or keyword
459
+ - num_articles: integer (default: 20) - Number of articles to analyze
460
+ - languages: array (default: ["English"]) - Summary languages
461
+ - include_audio: boolean (default: true) - Generate audio summaries
462
+ - sentiment_models: array (default: ["VADER", "Loughran-McDonald", "FinBERT"]) - Models to use
463
+ """)
464
+
465
+ st.write("**Example Request:**")
466
+ st.code(f"GET /api/analyze?query={results['query']}&num_articles=20")
467
+
468
+ st.write("**Sample Response:**")
469
+ sample_response = {
470
+ "query": results['query'],
471
+ "total_articles": len(results['articles']),
472
+ "average_sentiment": results['summary']['average_sentiment'],
473
+ "articles": results['articles'][:2] # Show first 2 articles as example
474
+ }
475
+ st.json(sample_response)
476
+
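+ # Illustrative client call for the backend API described above (hypothetical
+ # host/port; the actual route lives in the FastAPI module, not in this file):
+ #
+ #   import requests
+ #   resp = requests.get(
+ #       "http://localhost:8000/api/analyze",
+ #       params={"query": "Tesla", "num_articles": 20},
+ #       timeout=120,
+ #   )
+ #   resp.raise_for_status()
+ #   print(resp.json()["average_sentiment"])
+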
477
+ def prepare_csv_export(results):
478
+ """Prepare CSV data for export"""
479
+ csv_data = []
480
+
481
+ for article in results['articles']:
482
+ row = {
483
+ 'title': article['title'],
484
+ 'source': article['source'],
485
+ 'url': article.get('url', ''),
486
+ 'date': article.get('date', ''),
487
+ 'sentiment_compound': article['sentiment']['compound'],
488
+ 'sentiment_label': 'Positive' if article['sentiment']['compound'] > 0.1 else 'Negative' if article['sentiment']['compound'] < -0.1 else 'Neutral',
489
+ 'summary': article.get('summary', '')
490
+ }
491
+
492
+ # Add model-specific scores
493
+ if 'vader' in article['sentiment']:
494
+ row['vader_score'] = article['sentiment']['vader']
495
+ if 'loughran_mcdonald' in article['sentiment']:
496
+ row['loughran_mcdonald_score'] = article['sentiment']['loughran_mcdonald']
497
+ if 'finbert' in article['sentiment']:
498
+ row['finbert_score'] = article['sentiment']['finbert']
499
+
500
+ csv_data.append(row)
501
+
502
+ df = pd.DataFrame(csv_data)
503
+ return df.to_csv(index=False)
504
+
505
+ def show_demo_dashboard():
506
+ """Show demo dashboard with sample data"""
507
+ st.header("🚀 Welcome to Global Business News Intelligence")
508
+
509
+ st.markdown("""
510
+ ### Key Features:
511
+ - **🔍 Multi-Source News Scraping:** Aggregates news from reliable sources
512
+ - **🎯 Advanced Sentiment Analysis:** Uses VADER, Loughran-McDonald, and FinBERT models
513
+ - **🌐 Multilingual Support:** Summaries in English, Hindi, and Tamil
514
+ - **🎵 Audio Generation:** Text-to-speech for all language summaries
515
+ - **📊 Interactive Dashboard:** Real-time charts and visualizations
516
+ - **📤 Multiple Export Formats:** CSV, JSON, and PDF reports
517
+ - **🔌 API Access:** Programmatic access to all features
518
+
519
+ ### Use Cases:
520
+ - **📈 Investment Research:** Track sentiment around stocks and companies
521
+ - **🏢 Brand Monitoring:** Monitor public perception of your brand
522
+ - **🔍 Market Intelligence:** Stay informed about industry trends
523
+ - **📰 Media Analysis:** Analyze coverage patterns across sources
524
+ - **🌍 Global Insights:** Access news in multiple languages
525
+
526
+ ### Get Started:
527
+ 1. Enter a company name, stock ticker, or keyword in the sidebar
528
+ 2. Configure your analysis settings
529
+ 3. Click "Analyze News" to start
530
+ 4. Explore results in the interactive dashboard
531
+ 5. Export your findings in multiple formats
532
+ """)
533
+
534
+ # Sample visualization
535
+ st.subheader("📊 Sample Analysis Dashboard")
536
+
537
+ # Create sample data
538
+ sample_data = {
539
+ 'Sentiment': ['Positive', 'Negative', 'Neutral'],
540
+ 'Count': [45, 15, 40]
541
+ }
542
+
543
+ fig = px.pie(
544
+ values=sample_data['Count'],
545
+ names=sample_data['Sentiment'],
546
+ color=sample_data['Sentiment'],  # needed for the discrete color map to apply
+ color_discrete_map={'Positive': '#28a745', 'Negative': '#dc3545', 'Neutral': '#6c757d'},
547
+ title="Sample Sentiment Distribution"
548
+ )
549
+
550
+ col1, col2 = st.columns([1, 1])
551
+ with col1:
552
+ st.plotly_chart(fig, use_container_width=True)
553
+
554
+ with col2:
555
+ st.write("**Sample Metrics:**")
556
+ st.metric("Articles Analyzed", "100")
557
+ st.metric("Average Sentiment", "0.234")
558
+ st.metric("Sources Covered", "15")
559
+ st.metric("Languages", "3")
560
+
561
+ if __name__ == "__main__":
562
+ main()
summarizer_module.py ADDED
@@ -0,0 +1,400 @@
1
+ import logging
2
+ from typing import List, Optional
3
+ import re
4
+ from transformers import pipeline, AutoTokenizer
5
+ import torch
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+ class TextSummarizer:
10
+ """Text summarization with chunking for long documents"""
11
+
12
+ def __init__(self):
13
+ self.summarizer = None
14
+ self.tokenizer = None
15
+ self.max_chunk_length = 1024 # Maximum tokens per chunk
16
+ self.max_summary_length = 150
17
+ self.min_summary_length = 50
18
+
19
+ self._initialize_model()
20
+ logger.info("TextSummarizer initialized")
21
+
22
+ def _initialize_model(self):
23
+ """Initialize the summarization model"""
24
+ try:
25
+ # Try different models in order of preference
26
+ model_names = [
27
+ "facebook/bart-large-cnn",
28
+ "sshleifer/distilbart-cnn-12-6",
29
+ "t5-small"
30
+ ]
31
+
32
+ for model_name in model_names:
33
+ try:
34
+ # Use CPU to avoid memory issues on Hugging Face Spaces
35
+ device = -1 # CPU only for Hugging Face Spaces
36
+
37
+ self.summarizer = pipeline(
38
+ "summarization",
39
+ model=model_name,
40
+ tokenizer=model_name,
41
+ device=device,
42
+ framework="pt"
43
+ )
44
+
45
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name)
46
+ logger.info(f"Successfully loaded summarization model: {model_name}")
47
+ break
48
+
49
+ except Exception as e:
50
+ logger.warning(f"Failed to load {model_name}: {str(e)}")
51
+ continue
52
+
53
+ if self.summarizer is None:
54
+ logger.error("Failed to load any summarization model")
55
+
56
+ except Exception as e:
57
+ logger.error(f"Error initializing summarizer: {str(e)}")
58
+
59
+ def summarize(self, text: str, max_length: int = None, min_length: int = None) -> str:
60
+ """Summarize text with automatic chunking for long documents"""
61
+ if not text or not text.strip():
62
+ return ""
63
+
64
+ if not self.summarizer:
65
+ return self._fallback_summarize(text)
66
+
67
+ try:
68
+ # Use provided lengths or defaults
69
+ max_len = max_length or self.max_summary_length
70
+ min_len = min_length or self.min_summary_length
71
+
72
+ # Check if text needs chunking
73
+ if self._needs_chunking(text):
74
+ return self._summarize_long_text(text, max_len, min_len)
75
+ else:
76
+ return self._summarize_chunk(text, max_len, min_len)
77
+
78
+ except Exception as e:
79
+ logger.error(f"Summarization failed: {str(e)}")
80
+ return self._fallback_summarize(text)
81
+
82
+ def _needs_chunking(self, text: str) -> bool:
83
+ """Check if text needs to be chunked"""
84
+ if not self.tokenizer:
85
+ return len(text.split()) > 300 # Rough word count threshold
86
+
87
+ try:
88
+ tokens = self.tokenizer.encode(text, add_special_tokens=True)
89
+ return len(tokens) > self.max_chunk_length
90
+ except:
91
+ return len(text.split()) > 300
92
+
93
+ def _summarize_long_text(self, text: str, max_len: int, min_len: int) -> str:
94
+ """Summarize long text by chunking"""
95
+ try:
96
+ # Split text into chunks
97
+ chunks = self._split_into_chunks(text)
98
+
99
+ if not chunks:
100
+ return self._fallback_summarize(text)
101
+
102
+ # Summarize each chunk
103
+ chunk_summaries = []
104
+ for chunk in chunks:
105
+ if len(chunk.strip()) > 100: # Only summarize substantial chunks
106
+ summary = self._summarize_chunk(
107
+ chunk,
108
+ max_length=min(max_len // len(chunks) + 20, 100),
109
+ min_length=20
110
+ )
111
+ if summary and summary.strip():
112
+ chunk_summaries.append(summary)
113
+
114
+ if not chunk_summaries:
115
+ return self._fallback_summarize(text)
116
+
117
+ # Combine chunk summaries
118
+ combined_summary = " ".join(chunk_summaries)
119
+
120
+ # If combined summary is still too long, summarize again
121
+ if self._needs_chunking(combined_summary) and len(chunk_summaries) > 1:
122
+ final_summary = self._summarize_chunk(combined_summary, max_len, min_len)
123
+ return final_summary if final_summary else combined_summary
124
+
125
+ return combined_summary
126
+
127
+ except Exception as e:
128
+ logger.error(f"Long text summarization failed: {str(e)}")
129
+ return self._fallback_summarize(text)
130
+
131
+ def _summarize_chunk(self, text: str, max_length: int, min_length: int) -> str:
132
+ """Summarize a single chunk of text"""
133
+ try:
134
+ if not text or len(text.strip()) < 50:
135
+ return text
136
+
137
+ # Clean text
138
+ cleaned_text = self._clean_text_for_summarization(text)
139
+
140
+ if not cleaned_text:
141
+ return text[:200] + "..." if len(text) > 200 else text
142
+
143
+ # Generate summary
144
+ result = self.summarizer(
145
+ cleaned_text,
146
+ max_length=max_length,
147
+ min_length=min_length,
148
+ do_sample=False,
149
+ truncation=True
150
+ )
151
+
152
+ if result and len(result) > 0 and 'summary_text' in result[0]:
153
+ summary = result[0]['summary_text'].strip()
154
+
155
+ # Post-process summary
156
+ summary = self._post_process_summary(summary)
157
+
158
+ return summary if summary else cleaned_text[:200] + "..."
159
+
160
+ return cleaned_text[:200] + "..."
161
+
162
+ except Exception as e:
163
+ logger.error(f"Chunk summarization failed: {str(e)}")
164
+ return text[:200] + "..." if len(text) > 200 else text
165
+
166
+ def _split_into_chunks(self, text: str) -> List[str]:
167
+ """Split text into manageable chunks"""
168
+ try:
169
+ # Split by paragraphs first
170
+ paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
171
+
172
+ if not paragraphs:
173
+ paragraphs = [text]
174
+
175
+ chunks = []
176
+ current_chunk = ""
177
+ current_length = 0
178
+
179
+ for paragraph in paragraphs:
180
+ paragraph_length = len(paragraph.split())
181
+
182
+ # If adding this paragraph would exceed chunk size, start new chunk
183
+ if current_length + paragraph_length > 250 and current_chunk:
184
+ chunks.append(current_chunk.strip())
185
+ current_chunk = paragraph
186
+ current_length = paragraph_length
187
+ else:
188
+ if current_chunk:
189
+ current_chunk += "\n\n" + paragraph
190
+ else:
191
+ current_chunk = paragraph
192
+ current_length += paragraph_length
193
+
194
+ # Add remaining chunk
195
+ if current_chunk.strip():
196
+ chunks.append(current_chunk.strip())
197
+
198
+ # If no proper chunks, split by sentences
199
+ if not chunks or len(chunks) == 1 and len(chunks[0].split()) > 400:
200
+ return self._split_by_sentences(text)
201
+
202
+ return chunks
203
+
204
+ except Exception as e:
205
+ logger.error(f"Text splitting failed: {str(e)}")
206
+ return [text]
207
+
208
+ def _split_by_sentences(self, text: str) -> List[str]:
209
+ """Split text by sentences as fallback"""
210
+ try:
211
+ sentences = re.split(r'[.!?]+\s+', text)
212
+ chunks = []
213
+ current_chunk = ""
214
+
215
+ for sentence in sentences:
216
+ if len((current_chunk + " " + sentence).split()) > 200:
217
+ if current_chunk:
218
+ chunks.append(current_chunk.strip())
219
+ current_chunk = sentence
220
+ else:
221
+ if current_chunk:
222
+ current_chunk += ". " + sentence
223
+ else:
224
+ current_chunk = sentence
225
+
226
+ if current_chunk.strip():
227
+ chunks.append(current_chunk.strip())
228
+
229
+ return chunks if chunks else [text]
230
+
231
+ except Exception as e:
232
+ logger.error(f"Sentence splitting failed: {str(e)}")
233
+ return [text]
234
+
235
+ def _clean_text_for_summarization(self, text: str) -> str:
236
+ """Clean text for better summarization"""
237
+ if not text:
238
+ return ""
239
+
240
+ # Remove URLs
241
+ text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
242
+
243
+ # Remove email addresses
244
+ text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '', text)
245
+
246
+ # Remove excessive whitespace
247
+ text = re.sub(r'\s+', ' ', text)
248
+
249
+ # Remove common news artifacts
250
+ artifacts = [
251
+ r'\(Reuters\)', r'\(AP\)', r'\(Bloomberg\)', r'\(CNN\)',
252
+ r'-- .*$', r'Photo:.*$', r'Image:.*$', r'Video:.*$',
253
+ r'Subscribe.*$', r'Follow us.*$'
254
+ ]
255
+
256
+ for artifact in artifacts:
257
+ text = re.sub(artifact, '', text, flags=re.IGNORECASE | re.MULTILINE)
258
+
259
+ return text.strip()
260
+
261
+ def _post_process_summary(self, summary: str) -> str:
262
+ """Post-process generated summary"""
263
+ if not summary:
264
+ return ""
265
+
266
+ # Remove incomplete sentences at the end
267
+ sentences = re.split(r'[.!?]+', summary)
268
+ if len(sentences) > 1 and len(sentences[-1].strip()) < 10:
269
+ summary = '.'.join(sentences[:-1]) + '.'
270
+
271
+ # Capitalize first letter
272
+ summary = summary[0].upper() + summary[1:] if len(summary) > 1 else summary.upper()
273
+
274
+ # Ensure summary ends with punctuation
275
+ if summary and summary[-1] not in '.!?':
276
+ summary += '.'
277
+
278
+ return summary.strip()
279
+
280
+ def _fallback_summarize(self, text: str) -> str:
281
+ """Fallback summarization using simple extraction"""
282
+ try:
283
+ if not text or len(text.strip()) < 50:
284
+ return text
285
+
286
+ # Split into sentences
287
+ sentences = re.split(r'[.!?]+', text)
288
+ sentences = [s.strip() for s in sentences if s.strip() and len(s.split()) > 5]
289
+
290
+ if not sentences:
291
+ return text[:200] + "..." if len(text) > 200 else text
292
+
293
+ # Take first few sentences (extractive summary)
294
+ num_sentences = min(3, len(sentences))
295
+ summary_sentences = sentences[:num_sentences]
296
+
297
+ summary = '. '.join(summary_sentences)
298
+ if not summary.endswith('.'):
299
+ summary += '.'
300
+
301
+ # If summary is too long, truncate
302
+ if len(summary) > 300:
303
+ words = summary.split()
304
+ summary = ' '.join(words[:40]) + '...'
305
+
306
+ return summary
307
+
308
+ except Exception as e:
309
+ logger.error(f"Fallback summarization failed: {str(e)}")
310
+ return text[:200] + "..." if len(text) > 200 else text
311
+
312
+ def batch_summarize(self, texts: List[str], **kwargs) -> List[str]:
313
+ """Summarize multiple texts"""
314
+ summaries = []
315
+
316
+ for text in texts:
317
+ try:
318
+ summary = self.summarize(text, **kwargs)
319
+ summaries.append(summary)
320
+ except Exception as e:
321
+ logger.error(f"Batch summarization failed for one text: {str(e)}")
322
+ summaries.append(self._fallback_summarize(text))
323
+
324
+ return summaries
325
+
326
+ def get_summary_stats(self, original_text: str, summary: str) -> dict:
327
+ """Get statistics about the summarization"""
328
+ try:
329
+ original_words = len(original_text.split())
330
+ summary_words = len(summary.split())
331
+
332
+ compression_ratio = summary_words / original_words if original_words > 0 else 0
333
+
334
+ return {
335
+ 'original_length': original_words,
336
+ 'summary_length': summary_words,
337
+ 'compression_ratio': compression_ratio,
338
+ 'compression_percentage': (1 - compression_ratio) * 100
339
+ }
340
+
341
+ except Exception as e:
342
+ logger.error(f"Error calculating summary stats: {str(e)}")
343
+ return {
344
+ 'original_length': 0,
345
+ 'summary_length': 0,
346
+ 'compression_ratio': 0,
347
+ 'compression_percentage': 0
348
+ }
349
+
350
+ # Utility functions
351
+ def extract_key_sentences(text: str, num_sentences: int = 3) -> List[str]:
352
+ """Extract key sentences using simple heuristics"""
353
+ try:
354
+ sentences = re.split(r'[.!?]+', text)
355
+ sentences = [s.strip() for s in sentences if s.strip() and len(s.split()) > 5]
356
+
357
+ if not sentences:
358
+ return []
359
+
360
+ # Score sentences based on position and keyword density
361
+ scored_sentences = []
362
+
363
+ for i, sentence in enumerate(sentences):
364
+ score = 0
365
+
366
+ # Position bonus (earlier sentences get higher scores)
367
+ if i < len(sentences) * 0.3:
368
+ score += 3
369
+ elif i < len(sentences) * 0.6:
370
+ score += 2
371
+ else:
372
+ score += 1
373
+
374
+ # Length bonus (medium-length sentences preferred)
375
+ words = len(sentence.split())
376
+ if 10 <= words <= 25:
377
+ score += 2
378
+ elif 5 <= words <= 35:
379
+ score += 1
380
+
381
+ # Keyword bonus (sentences with common business/finance terms)
382
+ keywords = [
383
+ 'company', 'business', 'revenue', 'profit', 'growth', 'market',
384
+ 'financial', 'earnings', 'investment', 'stock', 'shares', 'economy'
385
+ ]
386
+
387
+ sentence_lower = sentence.lower()
388
+ keyword_count = sum(1 for keyword in keywords if keyword in sentence_lower)
389
+ score += keyword_count
390
+
391
+ scored_sentences.append((sentence, score))
392
+
393
+ # Sort by score and return top sentences
394
+ scored_sentences.sort(key=lambda x: x[1], reverse=True)
395
+
396
+ return [sent[0] for sent in scored_sentences[:num_sentences]]
397
+
398
+ except Exception as e:
399
+ logger.error(f"Key sentence extraction failed: {str(e)}")
400
+ return []
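+
+ # Quick sketch of the two entry points (illustrative; the first call downloads a
+ # transformer model, so offline runs fall back to extractive summarization):
+ if __name__ == "__main__":
+     sample = ("Acme Corp reported quarterly revenue growth of 12 percent and raised "
+               "its full-year guidance. Shares rose in after-hours trading as analysts "
+               "upgraded the stock on stronger margins.")
+     print(TextSummarizer().summarize(sample))
+     print(extract_key_sentences(sample, num_sentences=2))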
translator_module (1).py ADDED
@@ -0,0 +1,336 @@
1
+ import logging
2
+ from typing import Dict, List, Optional
3
+ from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
4
+ import torch
5
+ import re
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+ class MultilingualTranslator:
10
+ """Multilingual translation with support for Hindi and Tamil"""
11
+
12
+ def __init__(self):
13
+ self.translators = {}
14
+ self.language_codes = {
15
+ 'Hindi': 'hi',
16
+ 'Tamil': 'ta',
17
+ 'English': 'en'
18
+ }
19
+
20
+ # Supported translation pairs
21
+ self.supported_pairs = {
22
+ 'en-hi': 'Helsinki-NLP/opus-mt-en-hi',
23
+ 'en-ta': 'Helsinki-NLP/opus-mt-en-mul', # Multilingual model for Tamil; such models typically expect a target-language token (e.g. '>>tam<<') prepended to the input
24
+ 'hi-en': 'Helsinki-NLP/opus-mt-hi-en',
25
+ 'ta-en': 'Helsinki-NLP/opus-mt-mul-en'
26
+ }
27
+
28
+ self._initialize_models()
29
+ logger.info("MultilingualTranslator initialized")
30
+
31
+ def _initialize_models(self):
32
+ """Initialize translation models on-demand"""
33
+ # Don't load all models at startup to save memory
34
+ # They will be loaded when first needed
35
+ logger.info("Translation models will be loaded on-demand")
36
+
37
+ def _load_translator(self, source_lang: str, target_lang: str) -> Optional[object]:
38
+ """Load a specific translator model"""
39
+ pair_key = f"{source_lang}-{target_lang}"
40
+
41
+ if pair_key in self.translators:
42
+ return self.translators[pair_key]
43
+
44
+ try:
45
+ model_name = self.supported_pairs.get(pair_key)
46
+ if not model_name:
47
+ logger.error(f"No model available for {source_lang} -> {target_lang}")
48
+ return None
49
+
50
+ # Use CPU for Hugging Face Spaces compatibility
51
+ device = -1 # CPU only
52
+
53
+ translator = pipeline(
54
+ "translation",
55
+ model=model_name,
56
+ device=device,
57
+ framework="pt"
58
+ )
59
+
60
+ self.translators[pair_key] = translator
61
+ logger.info(f"Loaded translator for {source_lang} -> {target_lang}")
62
+
63
+ return translator
64
+
65
+ except Exception as e:
66
+ logger.error(f"Failed to load translator {pair_key}: {str(e)}")
67
+ return None
68
+
69
+ def translate(self, text: str, target_lang: str, source_lang: str = 'English') -> str:
70
+ """Translate text to target language"""
71
+ if not text or not text.strip():
72
+ return ""
73
+
74
+ # Get language codes
75
+ source_code = self.language_codes.get(source_lang, 'en')
76
+ target_code = self.language_codes.get(target_lang, target_lang.lower()[:2])
77
+
78
+ # If source and target are the same, return original text
79
+ if source_code == target_code:
80
+ return text
81
+
82
+ try:
83
+ # Load the appropriate translator
84
+ translator = self._load_translator(source_code, target_code)
85
+
86
+ if not translator:
87
+ return self._fallback_translate(text, target_lang)
88
+
89
+ # Clean and prepare text
90
+ cleaned_text = self._prepare_text_for_translation(text)
91
+
92
+ if not cleaned_text:
93
+ return text
94
+
95
+ # Split long text into chunks for translation
96
+ if len(cleaned_text.split()) > 200:
97
+ return self._translate_long_text(cleaned_text, translator)
98
+ else:
99
+ return self._translate_chunk(cleaned_text, translator)
100
+
101
+ except Exception as e:
102
+ logger.error(f"Translation failed: {str(e)}")
103
+ return self._fallback_translate(text, target_lang)
104
+
105
+ def _translate_chunk(self, text: str, translator) -> str:
106
+ """Translate a single chunk of text"""
107
+ try:
108
+ result = translator(text, max_length=512)
109
+
110
+ if result and len(result) > 0:
111
+ translated = result[0].get('translation_text', text)
112
+ return self._post_process_translation(translated)
113
+
114
+ return text
115
+
116
+ except Exception as e:
117
+ logger.error(f"Chunk translation failed: {str(e)}")
118
+ return text
119
+
120
+ def _translate_long_text(self, text: str, translator) -> str:
121
+ """Translate long text by splitting into chunks"""
122
+ try:
123
+ # Split by sentences
124
+ sentences = self._split_into_sentences(text)
125
+
126
+ if not sentences:
127
+ return text
128
+
129
+ translated_sentences = []
130
+ current_chunk = ""
131
+
132
+ for sentence in sentences:
133
+ # If adding this sentence would make chunk too long, translate current chunk
134
+ if len((current_chunk + " " + sentence).split()) > 150 and current_chunk:
135
+ translated = self._translate_chunk(current_chunk, translator)
136
+ translated_sentences.append(translated)
137
+ current_chunk = sentence
138
+ else:
139
+ if current_chunk:
140
+ current_chunk += " " + sentence
141
+ else:
142
+ current_chunk = sentence
143
+
144
+ # Translate remaining chunk
145
+ if current_chunk:
146
+ translated = self._translate_chunk(current_chunk, translator)
147
+ translated_sentences.append(translated)
148
+
149
+ return " ".join(translated_sentences)
150
+
151
+ except Exception as e:
152
+ logger.error(f"Long text translation failed: {str(e)}")
153
+ return text
154
+
155
+ def _split_into_sentences(self, text: str) -> List[str]:
156
+ """Split text into sentences"""
157
+ try:
158
+ # Simple sentence splitting
159
+ sentences = re.split(r'[.!?]+\s+', text)
160
+ sentences = [s.strip() for s in sentences if s.strip()]
161
+
162
+ return sentences
163
+
164
+ except Exception as e:
165
+ logger.error(f"Sentence splitting failed: {str(e)}")
166
+ return [text]
167
+
168
+ def _prepare_text_for_translation(self, text: str) -> str:
169
+ """Prepare text for translation"""
170
+ if not text:
171
+ return ""
172
+
173
+ # Remove URLs
174
+ text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
175
+
176
+ # Remove email addresses
177
+ text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '', text)
178
+
179
+ # Clean excessive whitespace
180
+ text = re.sub(r'\s+', ' ', text)
181
+
182
+ # Remove special characters that might cause issues
183
+ text = re.sub(r'[^\w\s.,!?;:\-\'"()/%$]', '', text)
184
+
185
+ return text.strip()
186
+
187
+ def _post_process_translation(self, text: str) -> str:
188
+ """Post-process translated text"""
189
+ if not text:
190
+ return ""
191
+
192
+ # Clean up extra spaces
193
+ text = re.sub(r'\s+', ' ', text)
194
+
195
+ # Capitalize first letter if it's a sentence
196
+ if text and len(text) > 1:
197
+ text = text[0].upper() + text[1:]
198
+
199
+ return text.strip()
200
+
201
+ def _fallback_translate(self, text: str, target_lang: str) -> str:
202
+ """Fallback translation with basic text processing"""
203
+ logger.warning(f"Using fallback translation for {target_lang}")
204
+
205
+ # For demonstration purposes, we'll return the original text with a note
206
+ # In a production system, you might use a different translation service
207
+ if target_lang.lower() in ['hindi', 'hi']:
208
+ return f"[Hindi] {text}"
209
+ elif target_lang.lower() in ['tamil', 'ta']:
210
+ return f"[Tamil] {text}"
211
+ else:
212
+ return text
213
+
214
+ def batch_translate(self, texts: List[str], target_lang: str, source_lang: str = 'English') -> List[str]:
215
+ """Translate multiple texts"""
216
+ translations = []
217
+
218
+ for text in texts:
219
+ try:
220
+ translation = self.translate(text, target_lang, source_lang)
221
+ translations.append(translation)
222
+ except Exception as e:
223
+ logger.error(f"Batch translation failed for one text: {str(e)}")
224
+ translations.append(self._fallback_translate(text, target_lang))
225
+
226
+ return translations
227
+
228
+ def detect_language(self, text: str) -> str:
229
+ """Simple language detection (basic implementation)"""
230
+ try:
231
+ # Basic detection using character patterns
232
+ if not text:
233
+ return 'en'
234
+
235
+ # Check for Devanagari script (Hindi)
236
+ if re.search(r'[\u0900-\u097F]', text):
237
+ return 'hi'
238
+
239
+ # Check for Tamil script
240
+ if re.search(r'[\u0B80-\u0BFF]', text):
241
+ return 'ta'
242
+
243
+ # Default to English
244
+ return 'en'
245
+
246
+ except Exception as e:
247
+ logger.error(f"Language detection failed: {str(e)}")
248
+ return 'en'
249
+
250
+ def get_supported_languages(self) -> List[str]:
251
+ """Get list of supported languages"""
252
+ return list(self.language_codes.keys())
253
+
254
+ def is_translation_available(self, source_lang: str, target_lang: str) -> bool:
255
+ """Check if translation is available between two languages"""
256
+ source_code = self.language_codes.get(source_lang, source_lang.lower()[:2])
257
+ target_code = self.language_codes.get(target_lang, target_lang.lower()[:2])
258
+
259
+ pair_key = f"{source_code}-{target_code}"
260
+ return pair_key in self.supported_pairs
261
+
262
+ def translate_with_confidence(self, text: str, target_lang: str, source_lang: str = 'English') -> Dict[str, Any]:
263
+ """Translate text and return result with confidence metrics"""
264
+ try:
265
+ translated_text = self.translate(text, target_lang, source_lang)
266
+
267
+ # Simple confidence calculation based on text characteristics
268
+ confidence = self._calculate_translation_confidence(text, translated_text, target_lang)
269
+
270
+ return {
271
+ 'original_text': text,
272
+ 'translated_text': translated_text,
273
+ 'source_language': source_lang,
274
+ 'target_language': target_lang,
275
+ 'confidence': confidence,
276
+ 'method': 'neural_translation' if translated_text != text else 'fallback'
277
+ }
278
+
279
+ except Exception as e:
280
+ logger.error(f"Translation with confidence failed: {str(e)}")
281
+ return {
282
+ 'original_text': text,
283
+ 'translated_text': text,
284
+ 'source_language': source_lang,
285
+ 'target_language': target_lang,
286
+ 'confidence': 0.0,
287
+ 'method': 'error',
288
+ 'error': str(e)
289
+ }
290
+
291
+ def _calculate_translation_confidence(self, original: str, translated: str, target_lang: str) -> float:
292
+ """Calculate a simple confidence score for translation"""
293
+ try:
294
+ # If translation failed (same as original), low confidence
295
+ if original == translated and target_lang != 'English':
296
+ return 0.2
297
+
298
+ # If text is very short, moderate confidence
299
+ if len(original.split()) < 5:
300
+ return 0.7
301
+
302
+ # If translation is significantly different in length, lower confidence
303
+ original_len = len(original.split())
304
+ translated_len = len(translated.split())
305
+
306
+ length_ratio = min(original_len, translated_len) / max(original_len, translated_len)
307
+
308
+ if length_ratio < 0.5:
309
+ return 0.6
310
+ elif length_ratio < 0.7:
311
+ return 0.8
312
+ else:
313
+ return 0.9
314
+
315
+ except Exception as e:
316
+ logger.error(f"Confidence calculation failed: {str(e)}")
317
+ return 0.5
318
+
319
+ # Utility functions
320
+ def get_language_name(code: str) -> str:
321
+ """Get full language name from code"""
322
+ code_to_name = {
323
+ 'en': 'English',
324
+ 'hi': 'Hindi',
325
+ 'ta': 'Tamil'
326
+ }
327
+ return code_to_name.get(code.lower(), code)
328
+
329
+ def get_language_code(name: str) -> str:
330
+ """Get language code from name"""
331
+ name_to_code = {
332
+ 'english': 'en',
333
+ 'hindi': 'hi',
334
+ 'tamil': 'ta'
335
+ }
336
+ return name_to_code.get(name.lower(), name.lower()[:2])
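A minimal usage sketch of the translator above, for reference. It assumes the file is importable as translator_module (i.e. the " (1)" suffix is dropped from the uploaded filename) and that the Helsinki-NLP models can be downloaded on first use; nothing below is part of the committed code.

# hypothetical import name; the upload is "translator_module (1).py"
from translator_module import MultilingualTranslator

translator = MultilingualTranslator()
headline = "Global markets rallied after the central bank held interest rates steady."

# Single translation; if no model can be loaded, the fallback returns a tagged copy of the input
hindi = translator.translate(headline, target_lang='Hindi', source_lang='English')

# Translation plus a rough confidence score and the method that produced it
result = translator.translate_with_confidence(headline, target_lang='Tamil')
print(result['translated_text'], result['confidence'], result['method'])

# Script-based detection: 'hi' for Devanagari, 'ta' for Tamil script, otherwise 'en'
print(translator.detect_language(hindi))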
tts_module.py ADDED
@@ -0,0 +1,336 @@
1
+ import logging
2
+ import os
3
+ import tempfile
4
+ from typing import Any, Dict, List, Optional
5
+ import hashlib
6
+ from datetime import datetime
7
+
8
+ # gTTS for text-to-speech
9
+ try:
10
+ from gtts import gTTS
11
+ GTTS_AVAILABLE = True
12
+ except ImportError:
13
+ GTTS_AVAILABLE = False
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+ class AudioGenerator:
18
+ """Text-to-speech audio generation with multilingual support"""
19
+
20
+ def __init__(self):
21
+ self.supported_languages = {
22
+ 'English': 'en',
23
+ 'Hindi': 'hi',
24
+ 'Tamil': 'ta'
25
+ }
26
+
27
+ # Audio cache directory
28
+ self.cache_dir = tempfile.mkdtemp(prefix='news_audio_')
29
+ self.audio_cache = {}
30
+
31
+ logger.info(f"AudioGenerator initialized with cache directory: {self.cache_dir}")
32
+
33
+ if not GTTS_AVAILABLE:
34
+ logger.warning("gTTS not available. Audio generation will be limited.")
35
+
36
+ def generate_audio(self, text: str, language: str = 'English', output_file: str = None) -> Optional[str]:
37
+ """Generate audio from text"""
38
+ if not text or not text.strip():
39
+ logger.warning("Empty text provided for audio generation")
40
+ return None
41
+
42
+ if not GTTS_AVAILABLE:
43
+ logger.error("gTTS not available for audio generation")
44
+ return None
45
+
46
+ try:
47
+ # Get language code
48
+ lang_code = self.supported_languages.get(language, 'en')
49
+
50
+ # Create cache key
51
+ cache_key = self._create_cache_key(text, language)
52
+
53
+ # Check cache first
54
+ if cache_key in self.audio_cache:
55
+ cached_file = self.audio_cache[cache_key]
56
+ if os.path.exists(cached_file):
57
+ logger.info(f"Using cached audio for {language}")
58
+ return cached_file
59
+
60
+ # Generate output filename if not provided
61
+ if not output_file:
62
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
63
+ output_file = os.path.join(self.cache_dir, f"audio_{lang_code}_{timestamp}.mp3")
64
+ elif not os.path.dirname(output_file):
65
+ output_file = os.path.join(self.cache_dir, output_file)
66
+
67
+ # Prepare text for TTS
68
+ clean_text = self._prepare_text_for_tts(text)
69
+
70
+ if not clean_text:
71
+ logger.warning("No valid text for TTS after cleaning")
72
+ return None
73
+
74
+ # Generate audio using gTTS
75
+ if lang_code in ['en', 'hi']:
76
+ # gTTS supports English and Hindi directly
77
+ tts = gTTS(text=clean_text, lang=lang_code, slow=False)
78
+ elif lang_code == 'ta':
79
+ # For Tamil, use English as fallback or try Tamil if available
80
+ try:
81
+ tts = gTTS(text=clean_text, lang='ta', slow=False)
82
+ except Exception:
83
+ logger.warning("Tamil not supported in gTTS, using English")
84
+ tts = gTTS(text=clean_text, lang='en', slow=False)
85
+ else:
86
+ # Default to English
87
+ tts = gTTS(text=clean_text, lang='en', slow=False)
88
+
89
+ # Save audio file
90
+ tts.save(output_file)
91
+
92
+ # Verify file was created
93
+ if os.path.exists(output_file) and os.path.getsize(output_file) > 0:
94
+ # Cache the result
95
+ self.audio_cache[cache_key] = output_file
96
+
97
+ logger.info(f"Audio generated successfully: {output_file}")
98
+ return output_file
99
+ else:
100
+ logger.error("Audio file was not created or is empty")
101
+ return None
102
+
103
+ except Exception as e:
104
+ logger.error(f"Audio generation failed: {str(e)}")
105
+ return None
106
+
107
+ def _create_cache_key(self, text: str, language: str) -> str:
108
+ """Create a cache key for the text and language combination"""
109
+ try:
110
+ combined = f"{text[:500]}_{language}" # Use first 500 chars to avoid very long keys
111
+ return hashlib.md5(combined.encode()).hexdigest()
112
+ except Exception as e:
113
+ logger.error(f"Cache key creation failed: {str(e)}")
114
+ return f"default_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
115
+
116
+ def _prepare_text_for_tts(self, text: str) -> str:
117
+ """Prepare text for text-to-speech conversion"""
118
+ if not text:
119
+ return ""
120
+
121
+ # Remove or replace problematic characters
122
+ import re
123
+
124
+ # Remove URLs
125
+ text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
126
+
127
+ # Remove email addresses
128
+ text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '', text)
129
+
130
+ # Replace multiple spaces with single space
131
+ text = re.sub(r'\s+', ' ', text)
132
+
133
+ # Remove excessive punctuation
134
+ text = re.sub(r'[.]{3,}', '...', text)
135
+ text = re.sub(r'[!]{2,}', '!', text)
136
+ text = re.sub(r'[?]{2,}', '?', text)
137
+
138
+ # Remove parenthetical citations and references
139
+ text = re.sub(r'\([^)]*\)', '', text)
140
+ text = re.sub(r'\[[^\]]*\]', '', text)
141
+
142
+ # Limit text length for TTS (gTTS has limits)
143
+ max_length = 5000 # Characters
144
+ if len(text) > max_length:
145
+ # Try to cut at sentence boundary
146
+ sentences = re.split(r'[.!?]+', text[:max_length])
147
+ if len(sentences) > 1:
148
+ text = '. '.join(sentences[:-1]) + '.'
149
+ else:
150
+ text = text[:max_length] + '...'
151
+
152
+ return text.strip()
153
+
154
+ def generate_batch_audio(self, texts: Dict[str, str], language: str = 'English') -> Dict[str, str]:
155
+ """Generate audio for multiple texts"""
156
+ results = {}
157
+
158
+ for key, text in texts.items():
159
+ try:
160
+ output_file = f"audio_{key}_{language.lower()}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp3"
161
+ audio_file = self.generate_audio(text, language, output_file)
162
+ results[key] = audio_file
163
+ except Exception as e:
164
+ logger.error(f"Batch audio generation failed for {key}: {str(e)}")
165
+ results[key] = None
166
+
167
+ return results
168
+
169
+ def generate_summary_audio(self, articles: List[Dict], languages: List[str] = None) -> Dict[str, str]:
170
+ """Generate audio summaries for articles in multiple languages"""
171
+ if languages is None:
172
+ languages = ['English']
173
+
174
+ audio_files = {}
175
+
176
+ try:
177
+ # Create overall summary text
178
+ summary_text = self._create_audio_summary(articles)
179
+
180
+ if not summary_text:
181
+ logger.warning("No summary text created for audio")
182
+ return audio_files
183
+
184
+ # Generate audio for each language
185
+ for language in languages:
186
+ if language in self.supported_languages:
187
+ try:
188
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
189
+ output_file = f"summary_{language.lower()}_{timestamp}.mp3"
190
+
191
+ audio_file = self.generate_audio(summary_text, language, output_file)
192
+
193
+ if audio_file:
194
+ audio_files[language] = audio_file
195
+ else:
196
+ logger.warning(f"Failed to generate audio for {language}")
197
+
198
+ except Exception as e:
199
+ logger.error(f"Audio generation failed for {language}: {str(e)}")
200
+ continue
201
+ else:
202
+ logger.warning(f"Language {language} not supported for audio")
203
+
204
+ return audio_files
205
+
206
+ except Exception as e:
207
+ logger.error(f"Summary audio generation failed: {str(e)}")
208
+ return audio_files
209
+
210
+ def _create_audio_summary(self, articles: List[Dict]) -> str:
211
+ """Create a comprehensive audio summary from articles"""
212
+ try:
213
+ if not articles:
214
+ return ""
215
+
216
+ # Calculate sentiment distribution
217
+ positive_count = sum(1 for article in articles if article.get('sentiment', {}).get('compound', 0) > 0.1)
218
+ negative_count = sum(1 for article in articles if article.get('sentiment', {}).get('compound', 0) < -0.1)
219
+ neutral_count = len(articles) - positive_count - negative_count
220
+
221
+ # Start building summary
222
+ summary_parts = []
223
+
224
+ # Opening
225
+ summary_parts.append(f"News analysis summary for {len(articles)} articles.")
226
+
227
+ # Sentiment overview
228
+ if positive_count > negative_count:
229
+ summary_parts.append(f"Overall sentiment is predominantly positive, with {positive_count} positive articles, {negative_count} negative, and {neutral_count} neutral.")
230
+ elif negative_count > positive_count:
231
+ summary_parts.append(f"Overall sentiment is predominantly negative, with {negative_count} negative articles, {positive_count} positive, and {neutral_count} neutral.")
232
+ else:
233
+ summary_parts.append(f"Sentiment is mixed with balanced coverage across {positive_count} positive, {negative_count} negative, and {neutral_count} neutral articles.")
234
+
235
+ # Top stories
236
+ # Most positive story
237
+ positive_articles = sorted(articles, key=lambda x: x.get('sentiment', {}).get('compound', 0), reverse=True)
238
+ if positive_articles and positive_articles[0].get('sentiment', {}).get('compound', 0) > 0.1:
239
+ top_positive = positive_articles[0]
240
+ summary_parts.append(f"Most positive coverage: {top_positive.get('title', '')[:100]}")
241
+
242
+ # Most negative story
243
+ negative_articles = sorted(articles, key=lambda x: x.get('sentiment', {}).get('compound', 0))
244
+ if negative_articles and negative_articles[0].get('sentiment', {}).get('compound', 0) < -0.1:
245
+ top_negative = negative_articles[0]
246
+ summary_parts.append(f"Most concerning coverage: {top_negative.get('title', '')[:100]}")
247
+
248
+ # Recent developments (if we have dates)
249
+ recent_articles = [a for a in articles if a.get('date')]
250
+ if recent_articles:
251
+ recent_articles.sort(key=lambda x: x.get('date', ''), reverse=True)
252
+ if len(recent_articles) > 0:
253
+ summary_parts.append(f"Latest development: {recent_articles[0].get('title', '')[:100]}")
254
+
255
+ # Closing
256
+ summary_parts.append("This concludes the news analysis summary.")
257
+
258
+ # Join all parts
259
+ full_summary = " ".join(summary_parts)
260
+
261
+ # Ensure reasonable length
262
+ if len(full_summary) > 1000:
263
+ # Truncate to first few sentences
264
+ sentences = full_summary.split('. ')
265
+ truncated = '. '.join(sentences[:8]) + '.'
266
+ return truncated
267
+
268
+ return full_summary
269
+
270
+ except Exception as e:
271
+ logger.error(f"Audio summary creation failed: {str(e)}")
272
+ return f"Analysis complete for {len(articles)} articles with mixed sentiment coverage."
273
+
274
+ def cleanup_cache(self, max_age_hours: int = 24):
275
+ """Clean up old audio files from cache"""
276
+ try:
277
+ if not os.path.exists(self.cache_dir):
278
+ return
279
+
280
+ current_time = datetime.now().timestamp()
281
+ max_age_seconds = max_age_hours * 3600
282
+
283
+ removed_count = 0
284
+
285
+ for filename in os.listdir(self.cache_dir):
286
+ filepath = os.path.join(self.cache_dir, filename)
287
+
288
+ if os.path.isfile(filepath):
289
+ file_age = current_time - os.path.getmtime(filepath)
290
+
291
+ if file_age > max_age_seconds:
292
+ try:
293
+ os.remove(filepath)
294
+ removed_count += 1
295
+
296
+ # Remove from cache dict as well
297
+ cache_keys_to_remove = [k for k, v in self.audio_cache.items() if v == filepath]
298
+ for key in cache_keys_to_remove:
299
+ del self.audio_cache[key]
300
+
301
+ except Exception as e:
302
+ logger.error(f"Failed to remove old audio file {filepath}: {str(e)}")
303
+
304
+ if removed_count > 0:
305
+ logger.info(f"Cleaned up {removed_count} old audio files")
306
+
307
+ except Exception as e:
308
+ logger.error(f"Cache cleanup failed: {str(e)}")
309
+
310
+ def get_cache_info(self) -> Dict[str, Any]:
311
+ """Get information about the audio cache"""
312
+ try:
313
+ cache_info = {
314
+ 'cache_directory': self.cache_dir,
315
+ 'cached_files': len(self.audio_cache),
316
+ 'supported_languages': list(self.supported_languages.keys()),
317
+ 'gtts_available': GTTS_AVAILABLE
318
+ }
319
+
320
+ if os.path.exists(self.cache_dir):
321
+ files = [f for f in os.listdir(self.cache_dir) if f.endswith('.mp3')]
322
+ cache_info['physical_files'] = len(files)
323
+
324
+ total_size = sum(os.path.getsize(os.path.join(self.cache_dir, f)) for f in files)
325
+ cache_info['total_size_bytes'] = total_size
326
+ cache_info['total_size_mb'] = round(total_size / (1024 * 1024), 2)
327
+
328
+ return cache_info
329
+
330
+ except Exception as e:
331
+ logger.error(f"Cache info retrieval failed: {str(e)}")
332
+ return {'error': str(e)}
333
+
334
+ def is_language_supported(self, language: str) -> bool:
335
+ """Check if a language is supported for audio generation"""
336
+ return language in self.supported_languages and GTTS_AVAILABLE
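A short, hedged example of driving AudioGenerator end to end. The article dictionaries are made-up placeholders, and actual MP3 output requires gTTS to be installed plus network access to Google's TTS endpoint.

from tts_module import AudioGenerator

audio = AudioGenerator()

# Placeholder articles in the shape _create_audio_summary expects
articles = [
    {'title': 'Chipmaker beats earnings expectations', 'sentiment': {'compound': 0.6}, 'date': '2024-05-01'},
    {'title': 'Regulator opens probe into data practices', 'sentiment': {'compound': -0.4}, 'date': '2024-05-02'},
]

# One MP3 per requested language; unsupported languages are skipped with a warning
files = audio.generate_summary_audio(articles, languages=['English', 'Hindi'])
print(files)  # e.g. {'English': '/tmp/news_audio_xxx/summary_english_<timestamp>.mp3', ...}

print(audio.get_cache_info())       # cache directory, file count, approximate size
audio.cleanup_cache(max_age_hours=1)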
utils_module (1).py ADDED
@@ -0,0 +1,442 @@
1
+ import logging
2
+ import os
3
+ import json
4
+ import pickle
5
+ import hashlib
6
+ from datetime import datetime, timedelta
7
+ from typing import Dict, Any, Optional, List
8
+ import tempfile
9
+ import sys
10
+
11
+ def setup_logging():
12
+ """Setup logging configuration"""
13
+ logging.basicConfig(
14
+ level=logging.INFO,
15
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
16
+ handlers=[
17
+ logging.StreamHandler(sys.stdout),
18
+ logging.FileHandler('news_analyzer.log')
19
+ ]
20
+ )
21
+
22
+ # Reduce noise from transformers and other libraries
23
+ logging.getLogger("transformers").setLevel(logging.WARNING)
24
+ logging.getLogger("urllib3").setLevel(logging.WARNING)
25
+ logging.getLogger("requests").setLevel(logging.WARNING)
26
+
27
+ def load_config() -> Dict[str, Any]:
28
+ """Load application configuration"""
29
+ default_config = {
30
+ 'max_articles': 50,
31
+ 'cache_ttl_hours': 6,
32
+ 'supported_languages': ['English', 'Hindi', 'Tamil'],
33
+ 'sentiment_models': ['VADER', 'Loughran-McDonald', 'FinBERT'],
34
+ 'summarization_max_length': 150,
35
+ 'summarization_min_length': 50,
36
+ 'audio_enabled': True,
37
+ 'translation_enabled': True,
38
+ 'keyword_extraction_enabled': True,
39
+ 'max_keywords': 20,
40
+ 'debug_mode': False
41
+ }
42
+
43
+ # Try to load config from file if it exists
44
+ config_file = 'config.json'
45
+ if os.path.exists(config_file):
46
+ try:
47
+ with open(config_file, 'r') as f:
48
+ file_config = json.load(f)
49
+ default_config.update(file_config)
50
+ except Exception as e:
51
+ logging.error(f"Failed to load config file: {str(e)}")
52
+
53
+ return default_config
54
+
55
+ class CacheManager:
56
+ """Simple file-based caching system"""
57
+
58
+ def __init__(self, cache_dir: str = None):
59
+ self.cache_dir = cache_dir or tempfile.mkdtemp(prefix='news_cache_')
60
+ self.ensure_cache_dir()
61
+
62
+ logging.info(f"Cache manager initialized with directory: {self.cache_dir}")
63
+
64
+ def ensure_cache_dir(self):
65
+ """Ensure cache directory exists"""
66
+ try:
67
+ os.makedirs(self.cache_dir, exist_ok=True)
68
+ except Exception as e:
69
+ logging.error(f"Failed to create cache directory: {str(e)}")
70
+
71
+ def _get_cache_key(self, key: str) -> str:
72
+ """Generate a safe cache key"""
73
+ return hashlib.md5(key.encode()).hexdigest()
74
+
75
+ def get(self, key: str, ttl_hours: int = 6) -> Optional[Any]:
76
+ """Get item from cache"""
77
+ try:
78
+ cache_key = self._get_cache_key(key)
79
+ cache_file = os.path.join(self.cache_dir, f"{cache_key}.pkl")
80
+
81
+ if not os.path.exists(cache_file):
82
+ return None
83
+
84
+ # Check if cache is expired
85
+ file_age = datetime.now().timestamp() - os.path.getmtime(cache_file)
86
+ if file_age > ttl_hours * 3600:
87
+ try:
88
+ os.remove(cache_file)
89
+ except:
90
+ pass
91
+ return None
92
+
93
+ # Load cached data
94
+ with open(cache_file, 'rb') as f:
95
+ data = pickle.load(f)
96
+
97
+ logging.debug(f"Cache hit for key: {key[:50]}...")
98
+ return data
99
+
100
+ except Exception as e:
101
+ logging.error(f"Cache get failed for key {key}: {str(e)}")
102
+ return None
103
+
104
+ def set(self, key: str, value: Any) -> bool:
105
+ """Set item in cache"""
106
+ try:
107
+ cache_key = self._get_cache_key(key)
108
+ cache_file = os.path.join(self.cache_dir, f"{cache_key}.pkl")
109
+
110
+ with open(cache_file, 'wb') as f:
111
+ pickle.dump(value, f)
112
+
113
+ logging.debug(f"Cache set for key: {key[:50]}...")
114
+ return True
115
+
116
+ except Exception as e:
117
+ logging.error(f"Cache set failed for key {key}: {str(e)}")
118
+ return False
119
+
120
+ def clear_expired(self, ttl_hours: int = 24):
121
+ """Clear expired cache entries"""
122
+ try:
123
+ current_time = datetime.now().timestamp()
124
+ max_age = ttl_hours * 3600
125
+ cleared_count = 0
126
+
127
+ for filename in os.listdir(self.cache_dir):
128
+ if filename.endswith('.pkl'):
129
+ filepath = os.path.join(self.cache_dir, filename)
130
+ file_age = current_time - os.path.getmtime(filepath)
131
+
132
+ if file_age > max_age:
133
+ try:
134
+ os.remove(filepath)
135
+ cleared_count += 1
136
+ except Exception as e:
137
+ logging.error(f"Failed to remove cache file {filepath}: {str(e)}")
138
+
139
+ if cleared_count > 0:
140
+ logging.info(f"Cleared {cleared_count} expired cache entries")
141
+
142
+ except Exception as e:
143
+ logging.error(f"Cache cleanup failed: {str(e)}")
144
+
145
+ # Global cache instance
146
+ cache_manager = CacheManager()
147
+
148
+ def cache_results(func):
149
+ """Decorator for caching function results"""
150
+ def wrapper(*args, **kwargs):
151
+ # Create cache key from function name and arguments
152
+ cache_key = f"{func.__name__}_{str(args)}_{str(kwargs)}"
153
+
154
+ # Try to get from cache
155
+ cached_result = cache_manager.get(cache_key)
156
+ if cached_result is not None:
157
+ return cached_result
158
+
159
+ # Execute function and cache result
160
+ result = func(*args, **kwargs)
161
+ cache_manager.set(cache_key, result)
162
+
163
+ return result
164
+
165
+ return wrapper
166
+
167
+ def validate_input(text: str, min_length: int = 10, max_length: int = 10000) -> bool:
168
+ """Validate input text"""
169
+ if not text or not isinstance(text, str):
170
+ return False
171
+
172
+ text = text.strip()
173
+ if len(text) < min_length or len(text) > max_length:
174
+ return False
175
+
176
+ return True
177
+
178
+ def sanitize_filename(filename: str) -> str:
179
+ """Sanitize filename for safe file system usage"""
180
+ import re
181
+
182
+ # Replace invalid characters
183
+ sanitized = re.sub(r'[<>:"/\\|?*]', '_', filename)
184
+
185
+ # Remove extra spaces and dots
186
+ sanitized = re.sub(r'\s+', '_', sanitized)
187
+ sanitized = re.sub(r'\.+', '.', sanitized)
188
+
189
+ # Limit length
190
+ if len(sanitized) > 200:
191
+ sanitized = sanitized[:200]
192
+
193
+ return sanitized
194
+
195
+ def format_datetime(dt: datetime = None) -> str:
196
+ """Format datetime for display"""
197
+ if dt is None:
198
+ dt = datetime.now()
199
+
200
+ return dt.strftime("%Y-%m-%d %H:%M:%S")
201
+
202
+ def calculate_processing_stats(start_time: datetime, num_articles: int) -> Dict[str, Any]:
203
+ """Calculate processing statistics"""
204
+ end_time = datetime.now()
205
+ processing_time = (end_time - start_time).total_seconds()
206
+
207
+ return {
208
+ 'start_time': format_datetime(start_time),
209
+ 'end_time': format_datetime(end_time),
210
+ 'processing_time_seconds': processing_time,
211
+ 'processing_time_formatted': f"{processing_time:.2f} seconds",
212
+ 'articles_processed': num_articles,
213
+ 'articles_per_second': round(num_articles / processing_time, 2) if processing_time > 0 else 0
214
+ }
215
+
216
+ def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 100) -> List[str]:
217
+ """Split text into overlapping chunks"""
218
+ if len(text) <= chunk_size:
219
+ return [text]
220
+
221
+ chunks = []
222
+ start = 0
223
+
224
+ while start < len(text):
225
+ end = start + chunk_size
226
+
227
+ # If this isn't the last chunk, try to break at a sentence boundary
228
+ if end < len(text):
229
+ # Look for sentence boundaries in the last 100 characters
230
+ last_part = text[end-100:end]
231
+ sentence_end = max(
232
+ last_part.rfind('.'),
233
+ last_part.rfind('!'),
234
+ last_part.rfind('?')
235
+ )
236
+
237
+ if sentence_end != -1:
238
+ end = end - 100 + sentence_end + 1
239
+
240
+ chunks.append(text[start:end].strip())
241
+ start = end - overlap
242
+
243
+ return [chunk for chunk in chunks if chunk.strip()]
244
+
245
+ def extract_domain(url: str) -> str:
246
+ """Extract domain from URL"""
247
+ try:
248
+ from urllib.parse import urlparse
249
+ parsed = urlparse(url)
250
+ return parsed.netloc.replace('www.', '')
251
+ except Exception:
252
+ return 'unknown'
253
+
254
+ def safe_divide(a: float, b: float, default: float = 0.0) -> float:
255
+ """Safely divide two numbers"""
256
+ try:
257
+ return a / b if b != 0 else default
258
+ except (TypeError, ZeroDivisionError):
259
+ return default
260
+
261
+ def truncate_text(text: str, max_length: int = 100, suffix: str = "...") -> str:
262
+ """Truncate text to specified length"""
263
+ if not text or len(text) <= max_length:
264
+ return text
265
+
266
+ return text[:max_length - len(suffix)] + suffix
267
+
268
+ def get_file_size_mb(filepath: str) -> float:
269
+ """Get file size in MB"""
270
+ try:
271
+ size_bytes = os.path.getsize(filepath)
272
+ return round(size_bytes / (1024 * 1024), 2)
273
+ except Exception:
274
+ return 0.0
275
+
276
+ def ensure_directory(directory: str):
277
+ """Ensure directory exists"""
278
+ try:
279
+ os.makedirs(directory, exist_ok=True)
280
+ except Exception as e:
281
+ logging.error(f"Failed to create directory {directory}: {str(e)}")
282
+
283
+ def load_json_file(filepath: str) -> Optional[Dict]:
284
+ """Load JSON file safely"""
285
+ try:
286
+ with open(filepath, 'r', encoding='utf-8') as f:
287
+ return json.load(f)
288
+ except Exception as e:
289
+ logging.error(f"Failed to load JSON file {filepath}: {str(e)}")
290
+ return None
291
+
292
+ def save_json_file(data: Dict, filepath: str) -> bool:
293
+ """Save data to JSON file safely"""
294
+ try:
295
+ ensure_directory(os.path.dirname(filepath))
296
+ with open(filepath, 'w', encoding='utf-8') as f:
297
+ json.dump(data, f, indent=2, default=str)
298
+ return True
299
+ except Exception as e:
300
+ logging.error(f"Failed to save JSON file {filepath}: {str(e)}")
301
+ return False
302
+
303
+ def merge_dictionaries(*dicts) -> Dict:
304
+ """Merge multiple dictionaries"""
305
+ result = {}
306
+ for d in dicts:
307
+ if isinstance(d, dict):
308
+ result.update(d)
309
+ return result
310
+
311
+ def get_system_info() -> Dict[str, Any]:
312
+ """Get basic system information"""
313
+ import platform
314
+ import psutil
315
+
316
+ try:
317
+ return {
318
+ 'platform': platform.platform(),
319
+ 'python_version': platform.python_version(),
320
+ 'cpu_count': os.cpu_count(),
321
+ 'memory_gb': round(psutil.virtual_memory().total / (1024**3), 2),
322
+ 'available_memory_gb': round(psutil.virtual_memory().available / (1024**3), 2),
323
+ 'disk_space_gb': round(psutil.disk_usage('/').total / (1024**3), 2)
324
+ }
325
+ except Exception as e:
326
+ logging.error(f"Failed to get system info: {str(e)}")
327
+ return {'error': str(e)}
328
+
329
+ def format_number(num: float, precision: int = 2) -> str:
330
+ """Format number for display"""
331
+ try:
332
+ if abs(num) >= 1_000_000:
333
+ return f"{num / 1_000_000:.{precision}f}M"
334
+ elif abs(num) >= 1_000:
335
+ return f"{num / 1_000:.{precision}f}K"
336
+ else:
337
+ return f"{num:.{precision}f}"
338
+ except Exception:
339
+ return str(num)
340
+
341
+ def calculate_sentiment_distribution(articles: List[Dict]) -> Dict[str, Any]:
342
+ """Calculate sentiment distribution statistics"""
343
+ try:
344
+ if not articles:
345
+ return {'positive': 0, 'negative': 0, 'neutral': 0, 'total': 0}
346
+
347
+ sentiments = []
348
+ for article in articles:
349
+ sentiment = article.get('sentiment', {})
350
+ compound = sentiment.get('compound', 0)
351
+ sentiments.append(compound)
352
+
353
+ positive_count = sum(1 for s in sentiments if s > 0.1)
354
+ negative_count = sum(1 for s in sentiments if s < -0.1)
355
+ neutral_count = len(sentiments) - positive_count - negative_count
356
+
357
+ avg_sentiment = sum(sentiments) / len(sentiments) if sentiments else 0
358
+
359
+ return {
360
+ 'positive': positive_count,
361
+ 'negative': negative_count,
362
+ 'neutral': neutral_count,
363
+ 'total': len(articles),
364
+ 'average_sentiment': round(avg_sentiment, 3),
365
+ 'positive_percentage': round((positive_count / len(articles)) * 100, 1),
366
+ 'negative_percentage': round((negative_count / len(articles)) * 100, 1),
367
+ 'neutral_percentage': round((neutral_count / len(articles)) * 100, 1)
368
+ }
369
+
370
+ except Exception as e:
371
+ logging.error(f"Sentiment distribution calculation failed: {str(e)}")
372
+ return {'positive': 0, 'negative': 0, 'neutral': 0, 'total': 0}
373
+
374
+ def create_progress_callback(progress_container=None):
375
+ """Create a progress callback function for Streamlit"""
376
+ def callback(progress: int, status: str):
377
+ if progress_container:
378
+ try:
379
+ progress_container.progress(progress)
380
+ if hasattr(progress_container, 'text'):
381
+ progress_container.text(status)
382
+ except Exception as e:
383
+ logging.error(f"Progress callback error: {str(e)}")
384
+ else:
385
+ logging.info(f"Progress: {progress}% - {status}")
386
+
387
+ return callback
388
+
389
+ def validate_url(url: str) -> bool:
390
+ """Validate if string is a valid URL"""
391
+ import re
392
+
393
+ url_pattern = re.compile(
394
+ r'^https?://' # http:// or https://
395
+ r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|' # domain...
396
+ r'localhost|' # localhost...
397
+ r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip
398
+ r'(?::\d+)?' # optional port
399
+ r'(?:/?|[/?]\S+)$', re.IGNORECASE)  # optional path/query
400
+
401
+ return url_pattern.match(url) is not None
402
+
403
+ class PerformanceTimer:
404
+ """Context manager for timing operations"""
405
+
406
+ def __init__(self, operation_name: str = "Operation"):
407
+ self.operation_name = operation_name
408
+ self.start_time = None
409
+ self.end_time = None
410
+
411
+ def __enter__(self):
412
+ self.start_time = datetime.now()
413
+ logging.info(f"Starting {self.operation_name}")
414
+ return self
415
+
416
+ def __exit__(self, exc_type, exc_val, exc_tb):
417
+ self.end_time = datetime.now()
418
+ duration = (self.end_time - self.start_time).total_seconds()
419
+ logging.info(f"Completed {self.operation_name} in {duration:.2f} seconds")
420
+
421
+ @property
422
+ def duration(self) -> float:
423
+ if self.start_time and self.end_time:
424
+ return (self.end_time - self.start_time).total_seconds()
425
+ return 0.0
426
+
427
+ def retry_operation(func, max_attempts: int = 3, delay: float = 1.0):
428
+ """Retry an operation with exponential backoff"""
429
+ import time
430
+
431
+ for attempt in range(max_attempts):
432
+ try:
433
+ return func()
434
+ except Exception as e:
435
+ if attempt == max_attempts - 1:
436
+ raise e
437
+
438
+ wait_time = delay * (2 ** attempt)
439
+ logging.warning(f"Attempt {attempt + 1} failed: {str(e)}. Retrying in {wait_time} seconds...")
440
+ time.sleep(wait_time)
441
+
442
+ return None
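To close, a minimal sketch exercising the utility helpers above. The module name is assumed to be utils_module (the upload is "utils_module (1).py"), and the cached results are pickled into a temporary directory by the global CacheManager.

from utils_module import (
    setup_logging, cache_results, retry_operation,
    PerformanceTimer, calculate_sentiment_distribution,
)

setup_logging()

@cache_results
def expensive_lookup(query: str) -> dict:
    # Stand-in for a slow scrape or model call; a repeat call with the same
    # arguments is served from the pickle cache instead of re-running.
    return {'query': query, 'articles': 12}

with PerformanceTimer("demo lookup") as timer:
    first = expensive_lookup("semiconductors")
    second = expensive_lookup("semiconductors")   # cache hit
print(timer.duration)

# Retry a flaky callable with exponential backoff (waits 1s, then 2s, then re-raises)
data = retry_operation(lambda: expensive_lookup("banking"), max_attempts=3, delay=1.0)

articles = [{'sentiment': {'compound': 0.5}}, {'sentiment': {'compound': -0.3}}]
print(calculate_sentiment_distribution(articles))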