Upload 13 files
- api_backend (1).py +355 -0
- config_json.json +51 -0
- dockerfile.txt +46 -0
- gitattributes_file.txt +33 -0
- nlp_module (1).py +464 -0
- report_module (1).py +606 -0
- requirements_file.txt +47 -0
- scraper_module.py +396 -0
- streamlit_app.py +562 -0
- summarizer_module.py +400 -0
- translator_module (1).py +336 -0
- tts_module.py +336 -0
- utils_module (1).py +442 -0
api_backend (1).py
ADDED
@@ -0,0 +1,355 @@
from fastapi import FastAPI, HTTPException, Query
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from typing import List, Optional, Dict, Any
import asyncio
import logging
from datetime import datetime
import json

# Import our modules
from scraper import NewsletterScraper
from nlp import SentimentAnalyzer, KeywordExtractor
from summarizer import TextSummarizer
from translator import MultilingualTranslator
from tts import AudioGenerator
from utils import setup_logging, cache_results

# Setup logging
setup_logging()
logger = logging.getLogger(__name__)

# FastAPI app
app = FastAPI(
    title="Global Business News Intelligence API",
    description="Advanced news analysis with sentiment, summarization, and multilingual support",
    version="1.0.0"
)

# CORS middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

class AnalysisRequest(BaseModel):
    query: str
    num_articles: int = 20
    languages: List[str] = ["English"]
    include_audio: bool = True
    sentiment_models: List[str] = ["VADER", "Loughran-McDonald", "FinBERT"]

class AnalysisResponse(BaseModel):
    query: str
    total_articles: int
    processing_time: float
    average_sentiment: float
    sentiment_distribution: Dict[str, int]
    articles: List[Dict[str, Any]]
    keywords: List[Dict[str, Any]]
    summary: Dict[str, Any]
    languages: List[str]
    audio_files: Optional[Dict[str, str]] = None

class NewsAnalyzer:
    """Main news analysis orchestrator"""

    def __init__(self):
        self.scraper = NewsletterScraper()
        self.sentiment_analyzer = SentimentAnalyzer()
        self.keyword_extractor = KeywordExtractor()
        self.summarizer = TextSummarizer()
        self.translator = MultilingualTranslator()
        self.audio_generator = AudioGenerator()

        logger.info("NewsAnalyzer initialized successfully")

    async def analyze_news_async(self, config: Dict[str, Any], progress_callback=None) -> Dict[str, Any]:
        """Async version of analyze_news"""
        return self.analyze_news(config, progress_callback)

    def analyze_news(self, config: Dict[str, Any], progress_callback=None) -> Dict[str, Any]:
        """Main analysis pipeline"""
        start_time = datetime.now()

        try:
            query = config['query']
            num_articles = config.get('num_articles', 20)
            languages = config.get('languages', ['English'])
            include_audio = config.get('include_audio', True)
            sentiment_models = config.get('sentiment_models', ['VADER', 'Loughran-McDonald', 'FinBERT'])

            logger.info(f"Starting analysis for query: {query}")

            if progress_callback:
                progress_callback(10, "Scraping articles...")

            # Step 1: Scrape articles
            articles = self.scraper.scrape_news(query, num_articles)
            logger.info(f"Scraped {len(articles)} articles")

            if not articles:
                raise ValueError("No articles found for the given query")

            if progress_callback:
                progress_callback(30, "Analyzing sentiment...")

            # Step 2: Sentiment analysis
            for article in articles:
                article['sentiment'] = self.sentiment_analyzer.analyze_sentiment(
                    article['content'],
                    models=sentiment_models
                )

            if progress_callback:
                progress_callback(50, "Extracting keywords...")

            # Step 3: Keyword extraction
            all_text = ' '.join([article['content'] for article in articles])
            keywords = self.keyword_extractor.extract_keywords(all_text)

            if progress_callback:
                progress_callback(60, "Generating summaries...")

            # Step 4: Summarization
            for article in articles:
                article['summary'] = self.summarizer.summarize(article['content'])

                # Multilingual summaries
                if len(languages) > 1:
                    article['summaries'] = {}
                    for lang in languages:
                        if lang != 'English':
                            article['summaries'][lang] = self.translator.translate(
                                article['summary'],
                                target_lang=lang
                            )
                        else:
                            article['summaries'][lang] = article['summary']

            if progress_callback:
                progress_callback(80, "Generating audio...")

            # Step 5: Audio generation
            audio_files = {}
            if include_audio and languages:
                # Create overall summary for audio
                overall_summary = self.create_overall_summary(articles, keywords)

                for lang in languages:
                    if lang in ['English', 'Hindi', 'Tamil']:
                        try:
                            if lang != 'English':
                                summary_text = self.translator.translate(overall_summary, target_lang=lang)
                            else:
                                summary_text = overall_summary

                            audio_file = self.audio_generator.generate_audio(
                                summary_text,
                                language=lang,
                                output_file=f"summary_{lang.lower()}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp3"
                            )
                            audio_files[lang] = audio_file
                        except Exception as e:
                            logger.error(f"Error generating audio for {lang}: {str(e)}")

            if progress_callback:
                progress_callback(90, "Finalizing results...")

            # Step 6: Calculate summary statistics
            sentiments = [article['sentiment']['compound'] for article in articles]
            average_sentiment = sum(sentiments) / len(sentiments) if sentiments else 0.0

            sentiment_distribution = {
                'Positive': sum(1 for s in sentiments if s > 0.1),
                'Negative': sum(1 for s in sentiments if s < -0.1),
                'Neutral': sum(1 for s in sentiments if -0.1 <= s <= 0.1)
            }

            # Step 7: Prepare results
            processing_time = (datetime.now() - start_time).total_seconds()

            results = {
                'query': query,
                'total_articles': len(articles),
                'processing_time': processing_time,
                'average_sentiment': average_sentiment,
                'sentiment_distribution': sentiment_distribution,
                'articles': articles,
                'keywords': keywords,
                'languages': languages,
                'audio_files': audio_files,
                'summary': {
                    'average_sentiment': average_sentiment,
                    'total_articles': len(articles),
                    'sources': len(set([article['source'] for article in articles])),
                    'date_range': self.get_date_range(articles)
                }
            }

            if progress_callback:
                progress_callback(100, "Analysis complete!")

            logger.info(f"Analysis completed successfully in {processing_time:.2f} seconds")
            return results

        except Exception as e:
            logger.error(f"Error in analysis pipeline: {str(e)}")
            raise e

    def create_overall_summary(self, articles: List[Dict], keywords: List[Dict]) -> str:
        """Create an overall summary for audio generation"""
        try:
            # Get top keywords
            top_keywords = [kw['keyword'] for kw in keywords[:10]]

            # Calculate sentiment distribution
            positive_count = sum(1 for article in articles if article['sentiment']['compound'] > 0.1)
            negative_count = sum(1 for article in articles if article['sentiment']['compound'] < -0.1)
            neutral_count = len(articles) - positive_count - negative_count

            # Create summary text
            summary = f"Analysis of {len(articles)} articles reveals "

            if positive_count > negative_count:
                summary += f"predominantly positive sentiment with {positive_count} positive, {negative_count} negative, and {neutral_count} neutral articles. "
            elif negative_count > positive_count:
                summary += f"predominantly negative sentiment with {negative_count} negative, {positive_count} positive, and {neutral_count} neutral articles. "
            else:
                summary += "mixed sentiment with balanced coverage. "

            if top_keywords:
                summary += f"Key topics include: {', '.join(top_keywords[:5])}. "

            # Add top stories
            top_positive = sorted(articles, key=lambda x: x['sentiment']['compound'], reverse=True)[:2]
            top_negative = sorted(articles, key=lambda x: x['sentiment']['compound'])[:2]

            if top_positive[0]['sentiment']['compound'] > 0.1:
                summary += f"Most positive coverage: {top_positive[0]['title'][:100]}. "

            if top_negative[0]['sentiment']['compound'] < -0.1:
                summary += f"Most concerning coverage: {top_negative[0]['title'][:100]}. "

            return summary

        except Exception as e:
            logger.error(f"Error creating overall summary: {str(e)}")
            return f"Analysis of {len(articles)} articles completed successfully."

    def get_date_range(self, articles: List[Dict]) -> Dict[str, str]:
        """Get the date range of articles"""
        try:
            dates = [article['date'] for article in articles if 'date' in article and article['date']]
            if dates:
                dates = [d for d in dates if d is not None]
                if dates:
                    min_date = min(dates)
                    max_date = max(dates)
                    return {
                        'start': str(min_date),
                        'end': str(max_date)
                    }
            return {'start': 'Unknown', 'end': 'Unknown'}
        except Exception as e:
            logger.error(f"Error getting date range: {str(e)}")
            return {'start': 'Unknown', 'end': 'Unknown'}

# Initialize the analyzer
analyzer = NewsAnalyzer()

# FastAPI endpoints
@app.get("/", response_model=Dict[str, str])
async def root():
    """API root endpoint"""
    return {
        "message": "Global Business News Intelligence API",
        "version": "1.0.0",
        "docs": "/docs"
    }

@app.get("/health", response_model=Dict[str, str])
async def health_check():
    """Health check endpoint"""
    return {"status": "healthy", "timestamp": datetime.now().isoformat()}

@app.get("/api/analyze", response_model=AnalysisResponse)
async def analyze_news_endpoint(
    query: str = Query(..., description="Company name, ticker, or keyword to analyze"),
    num_articles: int = Query(20, description="Number of articles to analyze (5-50)", ge=5, le=50),
    languages: List[str] = Query(["English"], description="Languages for summaries"),
    include_audio: bool = Query(True, description="Generate audio summaries"),
    sentiment_models: List[str] = Query(["VADER", "Loughran-McDonald", "FinBERT"], description="Sentiment models to use")
):
    """Main analysis endpoint"""
    try:
        config = {
            'query': query,
            'num_articles': num_articles,
            'languages': languages,
            'include_audio': include_audio,
            'sentiment_models': sentiment_models
        }

        results = await analyzer.analyze_news_async(config)

        return AnalysisResponse(**results)

    except Exception as e:
        logger.error(f"Error in analyze endpoint: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))

@app.post("/api/analyze", response_model=AnalysisResponse)
async def analyze_news_post(request: AnalysisRequest):
    """POST version of analysis endpoint"""
    try:
        config = request.dict()
        results = await analyzer.analyze_news_async(config)
        return AnalysisResponse(**results)

    except Exception as e:
        logger.error(f"Error in analyze POST endpoint: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))

@app.get("/api/sources", response_model=List[str])
async def get_available_sources():
    """Get list of available news sources"""
    return analyzer.scraper.get_available_sources()

@app.get("/api/models", response_model=Dict[str, List[str]])
async def get_available_models():
    """Get list of available models"""
    return {
        "sentiment_models": ["VADER", "Loughran-McDonald", "FinBERT"],
        "summarization_models": ["distilbart-cnn-12-6"],
        "translation_models": ["Helsinki-NLP/opus-mt-en-hi", "Helsinki-NLP/opus-mt-en-fi"],
        "audio_languages": ["English", "Hindi", "Tamil"]
    }

@app.get("/api/keywords/{query}", response_model=List[Dict[str, Any]])
async def extract_keywords_endpoint(
    query: str,
    num_keywords: int = Query(20, description="Number of keywords to extract", ge=5, le=50)
):
    """Extract keywords from a query or text"""
    try:
        # For demo purposes, we'll scrape a few articles and extract keywords
        articles = analyzer.scraper.scrape_news(query, 5)
        if not articles:
            raise HTTPException(status_code=404, detail="No articles found for query")

        all_text = ' '.join([article['content'] for article in articles])
        keywords = analyzer.keyword_extractor.extract_keywords(all_text, num_keywords=num_keywords)

        return keywords

    except Exception as e:
        logger.error(f"Error in keywords endpoint: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
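
For reference, a minimal client sketch of calling the POST /api/analyze endpoint above. The base URL and the example payload values are assumptions; only the field names come from the AnalysisRequest model.

import requests

# Hypothetical example values; only the field names are taken from AnalysisRequest.
payload = {
    "query": "Tesla",
    "num_articles": 10,
    "languages": ["English", "Hindi"],
    "include_audio": False,
    "sentiment_models": ["VADER", "FinBERT"],
}

resp = requests.post("http://localhost:8000/api/analyze", json=payload, timeout=300)
resp.raise_for_status()
data = resp.json()
print(data["total_articles"], data["average_sentiment"])
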
config_json.json
ADDED
@@ -0,0 +1,51 @@
{
  "max_articles": 50,
  "cache_ttl_hours": 6,
  "supported_languages": ["English", "Hindi", "Tamil"],
  "sentiment_models": ["VADER", "Loughran-McDonald", "FinBERT"],
  "summarization_max_length": 150,
  "summarization_min_length": 50,
  "audio_enabled": true,
  "translation_enabled": true,
  "keyword_extraction_enabled": true,
  "max_keywords": 20,
  "debug_mode": false,
  "huggingface_space_config": {
    "title": "Global Business News Intelligence Dashboard",
    "emoji": "📊",
    "colorFrom": "blue",
    "colorTo": "green",
    "sdk": "streamlit",
    "sdk_version": "1.28.1",
    "app_file": "app.py",
    "pinned": false,
    "license": "mit"
  },
  "api_config": {
    "host": "0.0.0.0",
    "port": 8000,
    "reload": false,
    "workers": 1
  },
  "performance_settings": {
    "max_concurrent_requests": 10,
    "request_timeout_seconds": 300,
    "memory_limit_gb": 4,
    "cpu_optimization": true
  },
  "news_sources": {
    "google_news": true,
    "reuters": true,
    "bbc": true,
    "cnbc": true,
    "bloomberg": true,
    "marketwatch": true,
    "financial_times": false
  },
  "model_settings": {
    "use_cpu_only": true,
    "model_cache_dir": "./model_cache",
    "download_models_on_startup": false,
    "optimize_for_inference": true
  }
}
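
Nothing in this upload shows which module consumes config_json.json, so as an illustrative assumption, loading it with the standard library would look like:

import json

# Load runtime settings from the config file (file name/path is an assumption).
with open("config_json.json", encoding="utf-8") as f:
    config = json.load(f)

print(config["max_articles"])        # 50
print(config["api_config"]["port"])  # 8000
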
dockerfile.txt
ADDED
@@ -0,0 +1,46 @@
# Use official Python runtime as base image
FROM python:3.9-slim

# Set working directory
WORKDIR /app

# Set environment variables
ENV PYTHONUNBUFFERED=1
ENV PYTHONDONTWRITEBYTECODE=1
ENV STREAMLIT_SERVER_HEADLESS=true
ENV STREAMLIT_SERVER_PORT=7860
ENV STREAMLIT_SERVER_ADDRESS=0.0.0.0
ENV STREAMLIT_BROWSER_GATHER_USAGE_STATS=false

# Install system dependencies
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    software-properties-common \
    git \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first for better caching
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r requirements.txt

# Download NLTK data
RUN python -c "import nltk; nltk.download('vader_lexicon'); nltk.download('punkt'); nltk.download('stopwords')"

# Copy application code
COPY . .

# Create necessary directories
RUN mkdir -p logs cache model_cache temp

# Expose port
EXPOSE 7860

# Health check
HEALTHCHECK CMD curl --fail http://localhost:7860/_stcore/health

# Run application
CMD ["streamlit", "run", "app.py", "--server.port=7860", "--server.address=0.0.0.0"]
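
Note that docker build looks for a file named Dockerfile by default, so this file would need to be renamed (or passed explicitly with -f dockerfile.txt). A local build would then be: docker build -t news-dashboard . followed by docker run -p 7860:7860 news-dashboard, where news-dashboard is a placeholder image tag.
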
gitattributes_file.txt
ADDED
@@ -0,0 +1,33 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
nlp_module (1).py
ADDED
@@ -0,0 +1,464 @@
import re
import string
import logging
from typing import Dict, List, Any, Optional
import pandas as pd
import numpy as np
from collections import Counter

# NLTK imports
import nltk
try:
    from nltk.sentiment import SentimentIntensityAnalyzer
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize, sent_tokenize
    from nltk.stem import PorterStemmer
except ImportError:
    pass

# Download required NLTK data
try:
    nltk.download('vader_lexicon', quiet=True)
    nltk.download('punkt', quiet=True)
    nltk.download('stopwords', quiet=True)
except:
    pass

# Transformers for FinBERT
try:
    from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
    import torch
except ImportError:
    pass

# YAKE for keyword extraction
try:
    import yake
except ImportError:
    pass

logger = logging.getLogger(__name__)

class SentimentAnalyzer:
    """Multi-model sentiment analysis"""

    def __init__(self):
        self.vader_analyzer = None
        self.finbert_pipeline = None
        self.loughran_mcdonald_dict = None

        self._initialize_models()
        logger.info("SentimentAnalyzer initialized")

    def _initialize_models(self):
        """Initialize all sentiment analysis models"""
        # VADER
        try:
            self.vader_analyzer = SentimentIntensityAnalyzer()
            logger.info("VADER model loaded")
        except Exception as e:
            logger.error(f"Failed to load VADER: {str(e)}")

        # FinBERT
        try:
            model_name = "ProsusAI/finbert"
            self.finbert_pipeline = pipeline(
                "sentiment-analysis",
                model=model_name,
                tokenizer=model_name,
                device=0 if torch.cuda.is_available() else -1
            )
            logger.info("FinBERT model loaded")
        except Exception as e:
            logger.warning(f"Failed to load FinBERT, using CPU fallback: {str(e)}")
            try:
                model_name = "ProsusAI/finbert"
                self.finbert_pipeline = pipeline(
                    "sentiment-analysis",
                    model=model_name,
                    tokenizer=model_name,
                    device=-1
                )
                logger.info("FinBERT model loaded on CPU")
            except Exception as e2:
                logger.error(f"Failed to load FinBERT completely: {str(e2)}")

        # Loughran-McDonald Dictionary
        try:
            self.loughran_mcdonald_dict = self._load_loughran_mcdonald()
            logger.info("Loughran-McDonald dictionary loaded")
        except Exception as e:
            logger.error(f"Failed to load Loughran-McDonald dictionary: {str(e)}")

    def _load_loughran_mcdonald(self) -> Dict[str, List[str]]:
        """Load Loughran-McDonald financial sentiment dictionary"""
        # Simplified version with key financial sentiment words
        return {
            'positive': [
                'profit', 'profitable', 'profitability', 'revenue', 'revenues', 'growth',
                'growing', 'increase', 'increased', 'increasing', 'success', 'successful',
                'gain', 'gains', 'benefit', 'benefits', 'improvement', 'improved', 'strong',
                'stronger', 'excellent', 'outstanding', 'exceed', 'exceeded', 'exceeds',
                'beat', 'beats', 'positive', 'optimistic', 'bullish', 'rise', 'rising',
                'surge', 'surged', 'boom', 'booming', 'expand', 'expansion', 'opportunity',
                'opportunities', 'advance', 'advances', 'achievement', 'achieve', 'winner'
            ],
            'negative': [
                'loss', 'losses', 'lose', 'losing', 'decline', 'declining', 'decrease',
                'decreased', 'decreasing', 'fall', 'falling', 'drop', 'dropped', 'plunge',
                'plunged', 'crash', 'crashed', 'failure', 'failed', 'weak', 'weakness',
                'poor', 'worse', 'worst', 'bad', 'terrible', 'crisis', 'problem', 'problems',
                'risk', 'risks', 'risky', 'concern', 'concerns', 'worried', 'worry',
                'negative', 'pessimistic', 'bearish', 'bankruptcy', 'bankrupt', 'deficit',
                'debt', 'lawsuit', 'sue', 'sued', 'investigation', 'fraud', 'scandal',
                'volatility', 'volatile', 'uncertainty', 'uncertain', 'challenge', 'challenges'
            ]
        }

    def analyze_sentiment(self, text: str, models: List[str] = None) -> Dict[str, Any]:
        """Analyze sentiment using multiple models"""
        if models is None:
            models = ['VADER', 'Loughran-McDonald', 'FinBERT']

        results = {}

        # Clean text
        cleaned_text = self._clean_text(text)

        # VADER Analysis
        if 'VADER' in models and self.vader_analyzer:
            try:
                vader_scores = self.vader_analyzer.polarity_scores(cleaned_text)
                results['vader'] = vader_scores['compound']
                results['vader_detailed'] = vader_scores
            except Exception as e:
                logger.error(f"VADER analysis failed: {str(e)}")
                results['vader'] = 0.0

        # Loughran-McDonald Analysis
        if 'Loughran-McDonald' in models and self.loughran_mcdonald_dict:
            try:
                lm_score = self._analyze_loughran_mcdonald(cleaned_text)
                results['loughran_mcdonald'] = lm_score
            except Exception as e:
                logger.error(f"Loughran-McDonald analysis failed: {str(e)}")
                results['loughran_mcdonald'] = 0.0

        # FinBERT Analysis
        if 'FinBERT' in models and self.finbert_pipeline:
            try:
                # Truncate text for FinBERT (max 512 tokens)
                truncated_text = cleaned_text[:2000]  # Approximate token limit
                finbert_result = self.finbert_pipeline(truncated_text)[0]

                # Convert to numerical score
                label = finbert_result['label'].lower()
                confidence = finbert_result['score']

                if label == 'positive':
                    finbert_score = confidence
                elif label == 'negative':
                    finbert_score = -confidence
                else:  # neutral
                    finbert_score = 0.0

                results['finbert'] = finbert_score
                results['finbert_detailed'] = finbert_result

            except Exception as e:
                logger.error(f"FinBERT analysis failed: {str(e)}")
                results['finbert'] = 0.0

        # Calculate composite score
        scores = []
        weights = {'vader': 0.3, 'loughran_mcdonald': 0.4, 'finbert': 0.3}

        for model in ['vader', 'loughran_mcdonald', 'finbert']:
            if model in results:
                scores.append(results[model] * weights[model])

        results['compound'] = sum(scores) if scores else 0.0

        return results

    def _clean_text(self, text: str) -> str:
        """Clean text for sentiment analysis"""
        if not text:
            return ""

        # Remove URLs
        text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)

        # Remove email addresses
        text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '', text)

        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text)

        # Remove special characters but keep basic punctuation
        text = re.sub(r'[^\w\s.,!?;:\-\'"()]', '', text)

        return text.strip()

    def _analyze_loughran_mcdonald(self, text: str) -> float:
        """Analyze sentiment using Loughran-McDonald dictionary"""
        try:
            words = word_tokenize(text.lower())

            positive_count = sum(1 for word in words if word in self.loughran_mcdonald_dict['positive'])
            negative_count = sum(1 for word in words if word in self.loughran_mcdonald_dict['negative'])

            total_sentiment_words = positive_count + negative_count

            if total_sentiment_words == 0:
                return 0.0

            # Calculate normalized score
            score = (positive_count - negative_count) / len(words) * 10  # Scale factor

            # Clamp to [-1, 1] range
            return max(-1.0, min(1.0, score))

        except Exception as e:
            logger.error(f"Loughran-McDonald calculation error: {str(e)}")
            return 0.0

class KeywordExtractor:
    """Extract important keywords from text using YAKE"""

    def __init__(self):
        self.stop_words = set()
        try:
            self.stop_words = set(stopwords.words('english'))
        except:
            # Fallback stop words
            self.stop_words = {
                'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
                'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'have',
                'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should',
                'may', 'might', 'must', 'can', 'this', 'that', 'these', 'those'
            }

        logger.info("KeywordExtractor initialized")

    def extract_keywords(self, text: str, num_keywords: int = 20) -> List[Dict[str, Any]]:
        """Extract keywords using YAKE algorithm"""
        try:
            # Use YAKE if available
            if 'yake' in globals():
                return self._extract_with_yake(text, num_keywords)
            else:
                return self._extract_with_frequency(text, num_keywords)

        except Exception as e:
            logger.error(f"Keyword extraction failed: {str(e)}")
            return []

    def _extract_with_yake(self, text: str, num_keywords: int) -> List[Dict[str, Any]]:
        """Extract keywords using YAKE algorithm"""
        try:
            # YAKE configuration
            kw_extractor = yake.KeywordExtractor(
                lan="en",
                n=3,  # n-gram size
                dedupLim=0.9,
                top=num_keywords,
                features=None
            )

            keywords = kw_extractor.extract_keywords(text)

            # Convert to desired format (lower score = more relevant in YAKE)
            result = []
            for keyword, score in keywords:
                result.append({
                    'keyword': keyword,
                    'score': 1.0 / (1.0 + score),  # Invert score so higher = more relevant
                    'relevance': 'high' if score < 0.1 else 'medium' if score < 0.3 else 'low'
                })

            return result

        except Exception as e:
            logger.error(f"YAKE extraction failed: {str(e)}")
            return self._extract_with_frequency(text, num_keywords)

    def _extract_with_frequency(self, text: str, num_keywords: int) -> List[Dict[str, Any]]:
        """Fallback keyword extraction using frequency analysis"""
        try:
            # Clean and tokenize
            words = word_tokenize(text.lower())

            # Filter words
            filtered_words = [
                word for word in words
                if (word not in self.stop_words and
                    word not in string.punctuation and
                    len(word) > 2 and
                    word.isalpha())
            ]

            # Count frequencies
            word_freq = Counter(filtered_words)

            # Get top keywords
            top_words = word_freq.most_common(num_keywords)

            # Calculate relevance scores
            max_freq = top_words[0][1] if top_words else 1

            result = []
            for word, freq in top_words:
                score = freq / max_freq
                result.append({
                    'keyword': word,
                    'score': score,
                    'relevance': 'high' if score > 0.7 else 'medium' if score > 0.3 else 'low'
                })

            return result

        except Exception as e:
            logger.error(f"Frequency extraction failed: {str(e)}")
            return []

class TextProcessor:
    """Text preprocessing and cleaning utilities"""

    def __init__(self):
        self.stemmer = PorterStemmer()
        logger.info("TextProcessor initialized")

    def clean_article_content(self, content: str) -> str:
        """Clean article content by removing boilerplate"""
        if not content:
            return ""

        # Remove common boilerplate patterns
        boilerplate_patterns = [
            r'Subscribe to our newsletter.*',
            r'Sign up for.*',
            r'Follow us on.*',
            r'Copyright.*',
            r'All rights reserved.*',
            r'Terms of use.*',
            r'Privacy policy.*',
            r'Cookie policy.*',
            r'\d+ comments?',
            r'Share this article.*',
            r'Related articles?.*',
            r'More from.*',
            r'Advertisement.*',
            r'Sponsored content.*'
        ]

        cleaned_content = content
        for pattern in boilerplate_patterns:
            cleaned_content = re.sub(pattern, '', cleaned_content, flags=re.IGNORECASE)

        # Remove extra whitespace
        cleaned_content = re.sub(r'\s+', ' ', cleaned_content)

        # Remove very short sentences (likely navigation/boilerplate)
        sentences = sent_tokenize(cleaned_content)
        meaningful_sentences = [
            sent for sent in sentences
            if len(sent.split()) > 5 and not self._is_boilerplate_sentence(sent)
        ]

        return ' '.join(meaningful_sentences).strip()

    def _is_boilerplate_sentence(self, sentence: str) -> bool:
        """Check if sentence is likely boilerplate"""
        boilerplate_indicators = [
            'click here', 'read more', 'subscribe', 'follow us', 'contact us',
            'terms of service', 'privacy policy', 'copyright', 'all rights reserved',
            'advertisement', 'sponsored', 'related articles'
        ]

        sentence_lower = sentence.lower()
        return any(indicator in sentence_lower for indicator in boilerplate_indicators)

    def extract_entities(self, text: str) -> Dict[str, List[str]]:
        """Extract named entities (companies, people, locations)"""
        # Simple regex-based entity extraction
        entities = {
            'companies': [],
            'people': [],
            'locations': [],
            'money': [],
            'dates': []
        }

        try:
            # Company patterns (simplified)
            company_pattern = r'\b[A-Z][a-zA-Z]+ (?:Inc|Corp|LLC|Ltd|Company|Co)\b'
            entities['companies'] = list(set(re.findall(company_pattern, text)))

            # Money patterns
            money_pattern = r'\$[\d,]+(?:\.\d{2})?(?:\s?(?:million|billion|trillion|k|M|B|T))?'
            entities['money'] = list(set(re.findall(money_pattern, text)))

            # Date patterns (simplified)
            date_pattern = r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}'
            entities['dates'] = list(set(re.findall(date_pattern, text)))

        except Exception as e:
            logger.error(f"Entity extraction failed: {str(e)}")

        return entities

    def calculate_readability(self, text: str) -> Dict[str, float]:
        """Calculate text readability metrics"""
        try:
            sentences = sent_tokenize(text)
            words = word_tokenize(text)

            if not sentences or not words:
                return {'flesch_score': 0.0, 'avg_sentence_length': 0.0, 'avg_word_length': 0.0}

            # Basic metrics
            num_sentences = len(sentences)
            num_words = len(words)
            num_syllables = sum(self._count_syllables(word) for word in words if word.isalpha())

            # Average sentence length
            avg_sentence_length = num_words / num_sentences

            # Average word length
            avg_word_length = sum(len(word) for word in words if word.isalpha()) / num_words

            # Flesch Reading Ease Score (simplified)
            flesch_score = 206.835 - (1.015 * avg_sentence_length) - (84.6 * (num_syllables / num_words))

            return {
                'flesch_score': max(0.0, min(100.0, flesch_score)),
                'avg_sentence_length': avg_sentence_length,
                'avg_word_length': avg_word_length
            }

        except Exception as e:
            logger.error(f"Readability calculation failed: {str(e)}")
            return {'flesch_score': 0.0, 'avg_sentence_length': 0.0, 'avg_word_length': 0.0}

    def _count_syllables(self, word: str) -> int:
        """Count syllables in a word (simplified)"""
        word = word.lower()
        vowels = 'aeiouy'
        syllable_count = 0
        prev_char_was_vowel = False

        for char in word:
            if char in vowels:
                if not prev_char_was_vowel:
                    syllable_count += 1
                prev_char_was_vowel = True
            else:
                prev_char_was_vowel = False

        # Handle silent e
        if word.endswith('e'):
            syllable_count -= 1

        # Every word has at least one syllable
        return max(1, syllable_count)
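
A minimal sketch of driving the classes above directly, assuming the module is saved as nlp.py to match the "from nlp import SentimentAnalyzer, KeywordExtractor" line in api_backend (1).py. The sample sentences are illustrative only, and the first run downloads the FinBERT model from the Hugging Face Hub.

from nlp import SentimentAnalyzer, KeywordExtractor

# Score a financial sentence with two of the three models.
analyzer = SentimentAnalyzer()
scores = analyzer.analyze_sentiment(
    "Quarterly revenue grew strongly, beating analyst expectations.",
    models=["VADER", "Loughran-McDonald"],
)
print(scores["compound"])  # weighted composite score in [-1, 1]

# Pull ranked keywords (YAKE if installed, frequency fallback otherwise).
extractor = KeywordExtractor()
for kw in extractor.extract_keywords("Central banks weigh rate cuts as inflation cools.", num_keywords=5):
    print(kw["keyword"], kw["relevance"])
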
report_module (1).py
ADDED
|
@@ -0,0 +1,606 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from typing import Dict, List, Any, Optional
|
| 3 |
+
import io
|
| 4 |
+
from datetime import datetime
|
| 5 |
+
import base64
|
| 6 |
+
|
| 7 |
+
# PDF generation
|
| 8 |
+
try:
|
| 9 |
+
from reportlab.lib.pagesizes import letter, A4
|
| 10 |
+
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, Image
|
| 11 |
+
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
|
| 12 |
+
from reportlab.lib.units import inch
|
| 13 |
+
from reportlab.lib import colors
|
| 14 |
+
from reportlab.graphics.shapes import Drawing
|
| 15 |
+
from reportlab.graphics.charts.piecharts import Pie
|
| 16 |
+
from reportlab.graphics.charts.barcharts import VerticalBarChart
|
| 17 |
+
REPORTLAB_AVAILABLE = True
|
| 18 |
+
except ImportError:
|
| 19 |
+
REPORTLAB_AVAILABLE = False
|
| 20 |
+
|
| 21 |
+
# Plotting for charts in PDF
|
| 22 |
+
try:
|
| 23 |
+
import matplotlib.pyplot as plt
|
| 24 |
+
import matplotlib
|
| 25 |
+
matplotlib.use('Agg') # Use non-interactive backend
|
| 26 |
+
MATPLOTLIB_AVAILABLE = True
|
| 27 |
+
except ImportError:
|
| 28 |
+
MATPLOTLIB_AVAILABLE = False
|
| 29 |
+
|
| 30 |
+
logger = logging.getLogger(__name__)
|
| 31 |
+
|
| 32 |
+
def generate_pdf_report(results: Dict[str, Any]) -> io.BytesIO:
|
| 33 |
+
"""Generate a comprehensive PDF report"""
|
| 34 |
+
if not REPORTLAB_AVAILABLE:
|
| 35 |
+
logger.error("ReportLab not available for PDF generation")
|
| 36 |
+
return _generate_simple_pdf_fallback(results)
|
| 37 |
+
|
| 38 |
+
try:
|
| 39 |
+
# Create PDF buffer
|
| 40 |
+
buffer = io.BytesIO()
|
| 41 |
+
|
| 42 |
+
# Create document
|
| 43 |
+
doc = SimpleDocTemplate(
|
| 44 |
+
buffer,
|
| 45 |
+
pagesize=A4,
|
| 46 |
+
rightMargin=72,
|
| 47 |
+
leftMargin=72,
|
| 48 |
+
topMargin=72,
|
| 49 |
+
bottomMargin=18
|
| 50 |
+
)
|
| 51 |
+
|
| 52 |
+
# Get styles
|
| 53 |
+
styles = getSampleStyleSheet()
|
| 54 |
+
|
| 55 |
+
# Create custom styles
|
| 56 |
+
title_style = ParagraphStyle(
|
| 57 |
+
'CustomTitle',
|
| 58 |
+
parent=styles['Heading1'],
|
| 59 |
+
fontSize=24,
|
| 60 |
+
spaceAfter=30,
|
| 61 |
+
textColor=colors.HexColor('#2E86AB'),
|
| 62 |
+
alignment=1 # Center
|
| 63 |
+
)
|
| 64 |
+
|
| 65 |
+
heading_style = ParagraphStyle(
|
| 66 |
+
'CustomHeading',
|
| 67 |
+
parent=styles['Heading2'],
|
| 68 |
+
fontSize=16,
|
| 69 |
+
spaceAfter=12,
|
| 70 |
+
spaceBefore=20,
|
| 71 |
+
textColor=colors.HexColor('#2E86AB')
|
| 72 |
+
)
|
| 73 |
+
|
| 74 |
+
# Build story (content)
|
| 75 |
+
story = []
|
| 76 |
+
|
| 77 |
+
# Title page
|
| 78 |
+
story.append(Paragraph("Global Business News Intelligence Report", title_style))
|
| 79 |
+
story.append(Spacer(1, 0.5*inch))
|
| 80 |
+
|
| 81 |
+
# Query and basic info
|
| 82 |
+
story.append(Paragraph(f"Analysis Target: {results.get('query', 'N/A')}", styles['Normal']))
|
| 83 |
+
story.append(Paragraph(f"Report Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", styles['Normal']))
|
| 84 |
+
story.append(Paragraph(f"Total Articles Analyzed: {results.get('total_articles', 0)}", styles['Normal']))
|
| 85 |
+
story.append(Paragraph(f"Processing Time: {results.get('processing_time', 0):.2f} seconds", styles['Normal']))
|
| 86 |
+
story.append(Spacer(1, 0.3*inch))
|
| 87 |
+
|
| 88 |
+
# Executive Summary
|
| 89 |
+
story.append(Paragraph("Executive Summary", heading_style))
|
| 90 |
+
summary_text = _create_executive_summary(results)
|
| 91 |
+
story.append(Paragraph(summary_text, styles['Normal']))
|
| 92 |
+
story.append(Spacer(1, 0.2*inch))
|
| 93 |
+
|
| 94 |
+
# Sentiment Analysis Section
|
| 95 |
+
story.append(Paragraph("Sentiment Analysis", heading_style))
|
| 96 |
+
sentiment_data = _create_sentiment_section(results, styles)
|
| 97 |
+
story.extend(sentiment_data)
|
| 98 |
+
|
| 99 |
+
# Top Stories Section
|
| 100 |
+
story.append(Paragraph("Key Stories", heading_style))
|
| 101 |
+
stories_data = _create_stories_section(results, styles)
|
| 102 |
+
story.extend(stories_data)
|
| 103 |
+
|
| 104 |
+
# Keywords Section
|
| 105 |
+
if 'keywords' in results and results['keywords']:
|
| 106 |
+
story.append(Paragraph("Key Topics and Themes", heading_style))
|
| 107 |
+
keywords_data = _create_keywords_section(results, styles)
|
| 108 |
+
story.extend(keywords_data)
|
| 109 |
+
|
| 110 |
+
# Sources Section
|
| 111 |
+
story.append(Paragraph("News Sources", heading_style))
|
| 112 |
+
sources_data = _create_sources_section(results, styles)
|
| 113 |
+
story.extend(sources_data)
|
| 114 |
+
|
| 115 |
+
# Methodology Section
|
| 116 |
+
story.append(Paragraph("Methodology", heading_style))
|
| 117 |
+
methodology_text = _create_methodology_section(results)
|
| 118 |
+
story.append(Paragraph(methodology_text, styles['Normal']))
|
| 119 |
+
|
| 120 |
+
# Build PDF
|
| 121 |
+
doc.build(story)
|
| 122 |
+
|
| 123 |
+
buffer.seek(0)
|
| 124 |
+
return buffer
|
| 125 |
+
|
| 126 |
+
except Exception as e:
|
| 127 |
+
logger.error(f"PDF generation failed: {str(e)}")
|
| 128 |
+
return _generate_simple_pdf_fallback(results)
|
| 129 |
+
|
| 130 |
+
def _create_executive_summary(results: Dict[str, Any]) -> str:
|
| 131 |
+
"""Create executive summary text"""
|
| 132 |
+
try:
|
| 133 |
+
query = results.get('query', 'the analyzed topic')
|
| 134 |
+
total_articles = results.get('total_articles', 0)
|
| 135 |
+
avg_sentiment = results.get('average_sentiment', 0)
|
| 136 |
+
|
| 137 |
+
sentiment_label = "positive" if avg_sentiment > 0.1 else "negative" if avg_sentiment < -0.1 else "neutral"
|
| 138 |
+
|
| 139 |
+
summary = f"This report analyzes {total_articles} news articles related to {query}. "
|
| 140 |
+
summary += f"The overall sentiment analysis reveals a {sentiment_label} tone with an average sentiment score of {avg_sentiment:.3f}. "
|
| 141 |
+
|
| 142 |
+
# Add sentiment distribution
|
| 143 |
+
dist = results.get('sentiment_distribution', {})
|
| 144 |
+
positive = dist.get('Positive', 0)
|
| 145 |
+
negative = dist.get('Negative', 0)
|
| 146 |
+
neutral = dist.get('Neutral', 0)
|
| 147 |
+
|
| 148 |
+
summary += f"The analysis shows {positive} positive articles ({positive/total_articles*100:.1f}%), "
|
| 149 |
+
summary += f"{negative} negative articles ({negative/total_articles*100:.1f}%), "
|
| 150 |
+
summary += f"and {neutral} neutral articles ({neutral/total_articles*100:.1f}%). "
|
| 151 |
+
|
| 152 |
+
# Add key insights
|
| 153 |
+
if avg_sentiment > 0.2:
|
| 154 |
+
summary += "The predominantly positive coverage suggests favorable market conditions or public perception."
|
| 155 |
+
elif avg_sentiment < -0.2:
|
| 156 |
+
summary += "The predominantly negative coverage indicates concerns or challenges that may require attention."
|
| 157 |
+
else:
|
| 158 |
+
summary += "The balanced sentiment coverage suggests a mixed outlook with both opportunities and challenges present."
|
| 159 |
+
|
| 160 |
+
return summary
|
| 161 |
+
|
| 162 |
+
except Exception as e:
|
| 163 |
+
logger.error(f"Executive summary creation failed: {str(e)}")
|
| 164 |
+
return "Analysis completed successfully with comprehensive sentiment evaluation across multiple news sources."
|
| 165 |
+
|
| 166 |
+
def _create_sentiment_section(results: Dict[str, Any], styles) -> List:
    """Create sentiment analysis section"""
    story = []

    try:
        # Sentiment distribution table (guarded denominator: total_articles may be 0)
        dist = results.get('sentiment_distribution', {})
        total = max(results.get('total_articles', 0), 1)
        sentiment_data = [
            ['Sentiment', 'Count', 'Percentage'],
            ['Positive', str(dist.get('Positive', 0)), f"{dist.get('Positive', 0)/total*100:.1f}%"],
            ['Negative', str(dist.get('Negative', 0)), f"{dist.get('Negative', 0)/total*100:.1f}%"],
            ['Neutral', str(dist.get('Neutral', 0)), f"{dist.get('Neutral', 0)/total*100:.1f}%"]
        ]

        sentiment_table = Table(sentiment_data)
        sentiment_table.setStyle(TableStyle([
            ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#2E86AB')),
            ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
            ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
            ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
            ('FONTSIZE', (0, 0), (-1, 0), 12),
            ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
            ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
            ('GRID', (0, 0), (-1, -1), 1, colors.black)
        ]))

        story.append(sentiment_table)
        story.append(Spacer(1, 0.2*inch))

        # Add sentiment analysis explanation
        explanation = "Sentiment analysis was performed using multiple models including VADER, Loughran-McDonald financial dictionary, and FinBERT. "
        explanation += "Scores range from -1.0 (most negative) to +1.0 (most positive), with scores between -0.1 and +0.1 considered neutral."

        story.append(Paragraph(explanation, styles['Normal']))
        story.append(Spacer(1, 0.2*inch))

    except Exception as e:
        logger.error(f"Sentiment section creation failed: {str(e)}")
        story.append(Paragraph("Sentiment analysis data unavailable.", styles['Normal']))

    return story

def _create_stories_section(results: Dict[str, Any], styles) -> List:
    """Create top stories section"""
    story = []

    try:
        articles = results.get('articles', [])
        if not articles:
            story.append(Paragraph("No articles available for analysis.", styles['Normal']))
            return story

        # Sort articles by sentiment score
        sorted_articles = sorted(articles, key=lambda x: x.get('sentiment', {}).get('compound', 0), reverse=True)

        # Most positive story
        if sorted_articles and sorted_articles[0].get('sentiment', {}).get('compound', 0) > 0.1:
            story.append(Paragraph("Most Positive Coverage:", styles['Heading3']))
            top_positive = sorted_articles[0]
            story.append(Paragraph(f"<b>Title:</b> {top_positive.get('title', 'N/A')}", styles['Normal']))
            story.append(Paragraph(f"<b>Source:</b> {top_positive.get('source', 'N/A')}", styles['Normal']))
            story.append(Paragraph(f"<b>Sentiment Score:</b> {top_positive.get('sentiment', {}).get('compound', 0):.3f}", styles['Normal']))
            if 'summary' in top_positive:
                story.append(Paragraph(f"<b>Summary:</b> {top_positive['summary'][:300]}...", styles['Normal']))
            story.append(Spacer(1, 0.2*inch))

        # Most negative story
        negative_articles = sorted(articles, key=lambda x: x.get('sentiment', {}).get('compound', 0))
        if negative_articles and negative_articles[0].get('sentiment', {}).get('compound', 0) < -0.1:
            story.append(Paragraph("Most Negative Coverage:", styles['Heading3']))
            top_negative = negative_articles[0]
            story.append(Paragraph(f"<b>Title:</b> {top_negative.get('title', 'N/A')}", styles['Normal']))
            story.append(Paragraph(f"<b>Source:</b> {top_negative.get('source', 'N/A')}", styles['Normal']))
            story.append(Paragraph(f"<b>Sentiment Score:</b> {top_negative.get('sentiment', {}).get('compound', 0):.3f}", styles['Normal']))
            if 'summary' in top_negative:
                story.append(Paragraph(f"<b>Summary:</b> {top_negative['summary'][:300]}...", styles['Normal']))
            story.append(Spacer(1, 0.2*inch))

        # Recent stories (if dates available)
        recent_articles = [a for a in articles if a.get('date')]
        if recent_articles:
            recent_articles.sort(key=lambda x: x.get('date', ''), reverse=True)
            story.append(Paragraph("Most Recent Coverage:", styles['Heading3']))
            recent = recent_articles[0]
            story.append(Paragraph(f"<b>Title:</b> {recent.get('title', 'N/A')}", styles['Normal']))
            story.append(Paragraph(f"<b>Source:</b> {recent.get('source', 'N/A')}", styles['Normal']))
            story.append(Paragraph(f"<b>Date:</b> {recent.get('date', 'N/A')}", styles['Normal']))
            story.append(Paragraph(f"<b>Sentiment Score:</b> {recent.get('sentiment', {}).get('compound', 0):.3f}", styles['Normal']))

    except Exception as e:
        logger.error(f"Stories section creation failed: {str(e)}")
        story.append(Paragraph("Story analysis data unavailable.", styles['Normal']))

    return story

def _create_keywords_section(results: Dict[str, Any], styles) -> List:
    """Create keywords section"""
    story = []

    try:
        keywords = results.get('keywords', [])[:15]  # Top 15 keywords

        if not keywords:
            story.append(Paragraph("No keywords extracted.", styles['Normal']))
            return story

        # Create keywords table
        keyword_data = [['Keyword', 'Relevance Score', 'Category']]

        for kw in keywords:
            relevance = kw.get('relevance', 'medium')
            score = kw.get('score', 0)
            keyword_data.append([
                kw.get('keyword', 'N/A'),
                f"{score:.3f}",
                relevance.title()
            ])

        keyword_table = Table(keyword_data)
        keyword_table.setStyle(TableStyle([
            ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#2E86AB')),
            ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
            ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
            ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
            ('FONTSIZE', (0, 0), (-1, 0), 10),
            ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
            ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
            ('GRID', (0, 0), (-1, -1), 1, colors.black)
        ]))

        story.append(keyword_table)
        story.append(Spacer(1, 0.2*inch))

        # Keywords explanation
        explanation = "Keywords were extracted using the YAKE (Yet Another Keyword Extractor) algorithm, "
        explanation += "which identifies the most relevant terms and phrases based on statistical analysis of the text corpus."

        story.append(Paragraph(explanation, styles['Normal']))

    except Exception as e:
        logger.error(f"Keywords section creation failed: {str(e)}")
        story.append(Paragraph("Keyword analysis data unavailable.", styles['Normal']))

    return story

def _create_sources_section(results: Dict[str, Any], styles) -> List:
    """Create news sources section"""
    story = []

    try:
        articles = results.get('articles', [])

        if not articles:
            story.append(Paragraph("No source data available.", styles['Normal']))
            return story

        # Count sources
        source_counts = {}
        for article in articles:
            source = article.get('source', 'Unknown')
            source_counts[source] = source_counts.get(source, 0) + 1

        # Create sources table
        source_data = [['News Source', 'Article Count', 'Percentage']]
        total_articles = len(articles)

        for source, count in sorted(source_counts.items(), key=lambda x: x[1], reverse=True):
            percentage = (count / total_articles) * 100
            source_data.append([source, str(count), f"{percentage:.1f}%"])

        sources_table = Table(source_data)
        sources_table.setStyle(TableStyle([
            ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#2E86AB')),
            ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
            ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
            ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
            ('FONTSIZE', (0, 0), (-1, 0), 10),
            ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
            ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
            ('GRID', (0, 0), (-1, -1), 1, colors.black)
        ]))

        story.append(sources_table)
        story.append(Spacer(1, 0.2*inch))

        # Sources explanation
        explanation = f"Articles were collected from {len(source_counts)} different news sources, "
        explanation += "providing diverse perspectives on the analyzed topic. Source diversity helps ensure comprehensive coverage and reduces bias."

        story.append(Paragraph(explanation, styles['Normal']))

    except Exception as e:
        logger.error(f"Sources section creation failed: {str(e)}")
        story.append(Paragraph("Source analysis data unavailable.", styles['Normal']))

    return story

def _create_methodology_section(results: Dict[str, Any]) -> str:
    """Create methodology section text"""
    methodology = "This analysis employed a comprehensive natural language processing pipeline:\n\n"

    methodology += "1. <b>Data Collection:</b> News articles were scraped from multiple reliable sources using RSS feeds and web scraping techniques. "
    methodology += "Content was filtered for relevance and deduplicated to ensure quality.\n\n"

    methodology += "2. <b>Sentiment Analysis:</b> Three complementary models were used: "
    methodology += "VADER (general sentiment), Loughran-McDonald dictionary (financial sentiment), and FinBERT (financial domain-specific). "
    methodology += "Final scores represent a weighted combination of all models.\n\n"

    methodology += "3. <b>Text Processing:</b> Articles were cleaned, summarized using transformer models, and analyzed for key themes. "
    methodology += "Keyword extraction employed the YAKE algorithm for statistical relevance.\n\n"

    methodology += "4. <b>Quality Assurance:</b> All content was filtered for English language, minimum length requirements, and relevance to the query terms. "
    methodology += "Results were validated across multiple model outputs for consistency.\n\n"

    if results.get('processing_time'):
        methodology += f"Total processing time: {results['processing_time']:.2f} seconds for {results.get('total_articles', 0)} articles."

    return methodology

def _generate_simple_pdf_fallback(results: Dict[str, Any]) -> io.BytesIO:
    """Generate a simple text-based PDF fallback"""
    try:
        from fpdf import FPDF

        pdf = FPDF()
        pdf.add_page()
        pdf.set_font('Arial', 'B', 16)
        pdf.cell(40, 10, 'News Analysis Report')
        pdf.ln(20)

        pdf.set_font('Arial', '', 12)
        pdf.cell(40, 10, f"Query: {results.get('query', 'N/A')}")
        pdf.ln(10)
        pdf.cell(40, 10, f"Articles: {results.get('total_articles', 0)}")
        pdf.ln(10)
        pdf.cell(40, 10, f"Average Sentiment: {results.get('average_sentiment', 0):.3f}")
        pdf.ln(20)

        # Simple sentiment distribution
        dist = results.get('sentiment_distribution', {})
        pdf.cell(40, 10, 'Sentiment Distribution:')
        pdf.ln(10)
        pdf.cell(40, 10, f"Positive: {dist.get('Positive', 0)}")
        pdf.ln(10)
        pdf.cell(40, 10, f"Negative: {dist.get('Negative', 0)}")
        pdf.ln(10)
        pdf.cell(40, 10, f"Neutral: {dist.get('Neutral', 0)}")

        # Save to buffer. fpdf2 (pinned in requirements) returns a bytearray
        # from output(), so the old dest='S' / latin-1 encode step is not needed.
        buffer = io.BytesIO()
        buffer.write(bytes(pdf.output()))
        buffer.seek(0)

        return buffer

    except Exception as e:
        logger.error(f"PDF fallback failed: {str(e)}")
        # Return a minimal buffer as last resort
        buffer = io.BytesIO()
        buffer.write(b"PDF generation failed. Please check logs.")
        buffer.seek(0)
        return buffer

def create_chart_image(data: Dict, chart_type: str = 'pie') -> Optional[str]:
    """Create a chart image for PDF inclusion"""
    if not MATPLOTLIB_AVAILABLE:
        return None

    try:
        plt.figure(figsize=(6, 4))

        if chart_type == 'pie' and 'sentiment_distribution' in data:
            dist = data['sentiment_distribution']
            labels = ['Positive', 'Negative', 'Neutral']
            sizes = [dist.get('Positive', 0), dist.get('Negative', 0), dist.get('Neutral', 0)]
            pie_colors = ['#28a745', '#dc3545', '#6c757d']  # renamed so it does not shadow reportlab's colors module

            plt.pie(sizes, labels=labels, colors=pie_colors, autopct='%1.1f%%', startangle=90)
            plt.title('Sentiment Distribution')

        elif chart_type == 'bar' and 'articles' in data:
            articles = data['articles']
            sources = {}
            for article in articles:
                source = article.get('source', 'Unknown')
                sources[source] = sources.get(source, 0) + 1

            # Top 10 sources
            top_sources = dict(sorted(sources.items(), key=lambda x: x[1], reverse=True)[:10])

            plt.bar(range(len(top_sources)), list(top_sources.values()), color='#2E86AB')
            plt.xticks(range(len(top_sources)), list(top_sources.keys()), rotation=45, ha='right')
            plt.title('Articles by Source')
            plt.ylabel('Article Count')

        plt.tight_layout()

        # Save to base64 string
        buffer = io.BytesIO()
        plt.savefig(buffer, format='png', dpi=150, bbox_inches='tight')
        buffer.seek(0)

        image_base64 = base64.b64encode(buffer.getvalue()).decode()
        plt.close()

        return image_base64

    except Exception as e:
        plt.close()  # release the figure even when chart creation fails
        logger.error(f"Chart creation failed: {str(e)}")
        return None

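# A minimal usage sketch for create_chart_image (the sample data below is
# hypothetical): the function returns a base64-encoded PNG string, which can
# be decoded back to bytes for embedding in a report or writing to disk.
#
#     sample = {'sentiment_distribution': {'Positive': 12, 'Negative': 5, 'Neutral': 3}}
#     encoded = create_chart_image(sample, chart_type='pie')
#     if encoded:
#         with open('sentiment_pie.png', 'wb') as f:
#             f.write(base64.b64decode(encoded))
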
def generate_csv_report(results: Dict[str, Any]) -> str:
    """Generate CSV report"""
    try:
        import csv
        import io

        output = io.StringIO()
        writer = csv.writer(output)

        # Write header
        writer.writerow([
            'Title', 'Source', 'URL', 'Date', 'Sentiment_Score', 'Sentiment_Label',
            'VADER_Score', 'LM_Score', 'FinBERT_Score', 'Summary'
        ])

        # Write article data
        articles = results.get('articles', [])
        for article in articles:
            sentiment = article.get('sentiment', {})
            compound = sentiment.get('compound', 0)

            # Determine sentiment label
            if compound > 0.1:
                label = 'Positive'
            elif compound < -0.1:
                label = 'Negative'
            else:
                label = 'Neutral'

            writer.writerow([
                article.get('title', ''),
                article.get('source', ''),
                article.get('url', ''),
                article.get('date', ''),
                compound,
                label,
                sentiment.get('vader', ''),
                sentiment.get('loughran_mcdonald', ''),
                sentiment.get('finbert', ''),
                article.get('summary', '')[:200] + '...' if len(article.get('summary', '')) > 200 else article.get('summary', '')
            ])

        return output.getvalue()

    except Exception as e:
        logger.error(f"CSV generation failed: {str(e)}")
        return "Error generating CSV report"

def generate_json_report(results: Dict[str, Any]) -> str:
    """Generate JSON report with formatted output"""
    try:
        import json
        from datetime import datetime

        # Create comprehensive report
        report = {
            'metadata': {
                'report_generated': datetime.now().isoformat(),
                'query': results.get('query', ''),
                'total_articles': results.get('total_articles', 0),
                'processing_time_seconds': results.get('processing_time', 0),
                'languages': results.get('languages', ['English'])
            },
            'summary': {
                'average_sentiment': results.get('average_sentiment', 0),
                'sentiment_distribution': results.get('sentiment_distribution', {}),
                'top_sources': _get_top_sources(results),
                'date_range': results.get('summary', {}).get('date_range', {})
            },
            'articles': results.get('articles', []),
            'keywords': results.get('keywords', [])[:20],  # Top 20 keywords
            'analysis_methods': {
                'sentiment_models': ['VADER', 'Loughran-McDonald', 'FinBERT'],
                'summarization_model': 'DistilBART',
                'keyword_extraction': 'YAKE',
                'translation_models': ['Helsinki-NLP Opus-MT']
            }
        }

        return json.dumps(report, indent=2, default=str, ensure_ascii=False)

    except Exception as e:
        logger.error(f"JSON generation failed: {str(e)}")
        return json.dumps({'error': str(e)}, indent=2)

def _get_top_sources(results: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Get top news sources from results"""
    try:
        articles = results.get('articles', [])
        sources = {}

        for article in articles:
            source = article.get('source', 'Unknown')
            sources[source] = sources.get(source, 0) + 1

        # Convert to list and sort
        source_list = [
            {'source': source, 'count': count, 'percentage': round((count / len(articles)) * 100, 1)}
            for source, count in sources.items()
        ]

        return sorted(source_list, key=lambda x: x['count'], reverse=True)[:10]

    except Exception as e:
        logger.error(f"Top sources calculation failed: {str(e)}")
        return []

def validate_report_data(results: Dict[str, Any]) -> bool:
    """Validate that results contain required data for reporting"""
    required_keys = ['query', 'articles', 'total_articles']

    for key in required_keys:
        if key not in results:
            logger.error(f"Missing required key for reporting: {key}")
            return False

    if not isinstance(results['articles'], list) or len(results['articles']) == 0:
        logger.error("No articles available for reporting")
        return False

    return True

# Export functions
__all__ = [
    'generate_pdf_report',
    'generate_csv_report',
    'generate_json_report',
    'create_chart_image',
    'validate_report_data'
]
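
# A minimal end-to-end usage sketch; the sample results dict is hypothetical
# and only mirrors the keys the functions above expect. Real runs receive this
# structure from the analysis pipeline.
if __name__ == '__main__':
    sample_results = {
        'query': 'Tesla',
        'total_articles': 1,
        'average_sentiment': 0.42,
        'sentiment_distribution': {'Positive': 1, 'Negative': 0, 'Neutral': 0},
        'articles': [{
            'title': 'Example headline', 'source': 'Example Wire',
            'url': 'https://example.com/article', 'date': '2024-01-01',
            'sentiment': {'compound': 0.42}, 'summary': 'Short summary.'
        }],
    }
    if validate_report_data(sample_results):
        print(generate_csv_report(sample_results))
        print(generate_json_report(sample_results))
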
requirements_file.txt
ADDED
@@ -0,0 +1,47 @@
# Core Framework
streamlit==1.28.1
fastapi==0.104.1
uvicorn==0.24.0

# Web Scraping & RSS
requests==2.31.0
beautifulsoup4==4.12.2
feedparser==6.0.10
trafilatura==1.6.2
lxml==4.9.3

# NLP & Machine Learning
transformers==4.35.2
# pip does not accept --index-url appended to a requirement line, so the
# CPU-only PyTorch wheel index is given as a standalone option line instead
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.1.0
nltk==3.8.1
langdetect==1.0.9
yake==0.4.8
vaderSentiment==3.3.2

# Data Processing
pandas==2.0.3
numpy==1.24.3

# Visualization
plotly==5.17.0
matplotlib==3.7.2
wordcloud==1.9.2

# Translation & Audio
gtts==2.4.0

# Report Generation
reportlab==4.0.4
fpdf2==2.7.6

# Utilities
python-dotenv==1.0.0
psutil==5.9.5
Pillow==10.0.1

# HTTP & Async
httpx==0.25.0
aiofiles==23.2.1

# Caching
diskcache==5.6.3
scraper_module.py
ADDED
@@ -0,0 +1,396 @@
import requests
from bs4 import BeautifulSoup
import feedparser
import trafilatura
from urllib.parse import urljoin, urlparse
import time
import logging
from datetime import datetime, timedelta
from typing import List, Dict, Optional, Set
import hashlib
import re
from langdetect import detect
import random
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

logger = logging.getLogger(__name__)

class NewsletterScraper:
    """Robust news scraper with multiple sources and deduplication"""

    def __init__(self):
        self.session = self._create_session()
        self.scraped_urls: Set[str] = set()
        self.content_hashes: Set[str] = set()

        # News sources configuration
        self.rss_sources = {
            'google_news': 'https://news.google.com/rss/search?q={}&hl=en&gl=US&ceid=US:en',
            'yahoo_finance': 'https://feeds.finance.yahoo.com/rss/2.0/headline',
            'reuters_business': 'https://www.reutersagency.com/feed/?best-topics=business-finance&post_type=best',
            'bbc_business': 'http://feeds.bbci.co.uk/news/business/rss.xml',
            'cnbc': 'https://www.cnbc.com/id/100003114/device/rss/rss.html',
            'marketwatch': 'http://feeds.marketwatch.com/marketwatch/topstories/',
            'financial_times': 'https://www.ft.com/rss/home',
            'bloomberg': 'https://feeds.bloomberg.com/politics/news.rss'
        }

        self.user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:89.0) Gecko/20100101 Firefox/89.0'
        ]

        logger.info("NewsletterScraper initialized")

    def _create_session(self) -> requests.Session:
        """Create a session with retry strategy"""
        session = requests.Session()

        # Retry strategy
        retry_strategy = Retry(
            total=3,
            backoff_factor=1,
            status_forcelist=[429, 500, 502, 503, 504],
        )

        adapter = HTTPAdapter(max_retries=retry_strategy)
        session.mount("http://", adapter)
        session.mount("https://", adapter)

        return session

    def _get_random_headers(self) -> Dict[str, str]:
        """Get randomized headers to avoid blocking"""
        return {
            'User-Agent': random.choice(self.user_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }

    def scrape_news(self, query: str, max_articles: int = 20) -> List[Dict]:
        """Main scraping function"""
        logger.info(f"Starting news scraping for query: {query}")

        all_articles = []
        self.scraped_urls.clear()
        self.content_hashes.clear()

        try:
            # Primary: Google News RSS
            google_articles = self._scrape_google_news(query, max_articles // 2)
            all_articles.extend(google_articles)

            # Secondary: other RSS sources (limited to avoid timeouts)
            for source_name, rss_url in list(self.rss_sources.items())[1:4]:
                if len(all_articles) >= max_articles:
                    break

                try:
                    source_articles = self._scrape_rss_source(rss_url, query, 5)
                    all_articles.extend(source_articles)
                except Exception as e:
                    logger.warning(f"Failed to scrape {source_name}: {str(e)}")
                    continue

            # Deduplicate and filter
            articles = self._deduplicate_articles(all_articles)
            articles = self._filter_articles(articles, query)
            articles = articles[:max_articles]

            # Extract full content
            for article in articles:
                try:
                    full_content = self._extract_full_content(article['url'])
                    if full_content and len(full_content) > 200:
                        article['content'] = full_content
                    else:
                        article['content'] = article.get('summary', article.get('title', ''))
                except Exception as e:
                    logger.warning(f"Failed to extract content from {article['url']}: {str(e)}")
                    article['content'] = article.get('summary', article.get('title', ''))

            # Filter by language (English only)
            articles = [article for article in articles if self._is_english(article['content'])]

            logger.info(f"Successfully scraped {len(articles)} articles")
            return articles

        except Exception as e:
            logger.error(f"Error in scrape_news: {str(e)}")
            return []

    def _scrape_google_news(self, query: str, max_articles: int) -> List[Dict]:
        """Scrape Google News RSS"""
        try:
            url = self.rss_sources['google_news'].format(query.replace(' ', '%20'))

            headers = self._get_random_headers()
            response = self.session.get(url, headers=headers, timeout=10)

            if response.status_code != 200:
                logger.warning(f"Google News RSS returned status {response.status_code}")
                return []

            feed = feedparser.parse(response.content)
            articles = []

            for entry in feed.entries[:max_articles * 2]:  # Get extra for filtering
                try:
                    article = {
                        'title': entry.title,
                        'url': entry.link,
                        'summary': entry.get('summary', ''),
                        'date': self._parse_date(entry.get('published', '')),
                        'source': 'Google News'
                    }

                    # Skip if already seen
                    if article['url'] in self.scraped_urls:
                        continue

                    self.scraped_urls.add(article['url'])
                    articles.append(article)

                except Exception as e:
                    logger.warning(f"Error parsing Google News entry: {str(e)}")
                    continue

            return articles

        except Exception as e:
            logger.error(f"Error scraping Google News: {str(e)}")
            return []

    def _scrape_rss_source(self, rss_url: str, query: str, max_articles: int) -> List[Dict]:
        """Scrape a generic RSS source"""
        try:
            headers = self._get_random_headers()
            response = self.session.get(rss_url, headers=headers, timeout=10)

            if response.status_code != 200:
                return []

            feed = feedparser.parse(response.content)
            articles = []
            query_lower = query.lower()

            for entry in feed.entries[:max_articles * 3]:  # Get extra for filtering
                try:
                    title = entry.get('title', '')
                    summary = entry.get('summary', '')

                    # Check if article is relevant to query
                    if not (query_lower in title.lower() or query_lower in summary.lower()):
                        continue

                    article = {
                        'title': title,
                        'url': entry.get('link', ''),
                        'summary': summary,
                        'date': self._parse_date(entry.get('published', '')),
                        'source': self._extract_source_name(rss_url)
                    }

                    # Skip if already seen
                    if article['url'] in self.scraped_urls:
                        continue

                    self.scraped_urls.add(article['url'])
                    articles.append(article)

                    if len(articles) >= max_articles:
                        break

                except Exception as e:
                    logger.warning(f"Error parsing RSS entry: {str(e)}")
                    continue

            # Small delay to be respectful
            time.sleep(0.5)

            return articles

        except Exception as e:
            logger.error(f"Error scraping RSS {rss_url}: {str(e)}")
            return []

    def _extract_full_content(self, url: str) -> Optional[str]:
        """Extract full article content using trafilatura"""
        try:
            # Download the page through the retry-enabled session.
            # trafilatura.fetch_url() does not accept custom headers, so the
            # randomized headers are applied here and the HTML string is
            # handed to trafilatura.extract() instead.
            headers = self._get_random_headers()
            response = self.session.get(url, headers=headers, timeout=10)

            if response.status_code != 200 or not response.text:
                return None

            # Extract text content from the downloaded HTML
            text = trafilatura.extract(
                response.text,
                include_comments=False,
                include_tables=False,
                include_formatting=False,
                no_fallback=False
            )

            if text and len(text.strip()) > 100:
                return text.strip()

            return None

        except Exception as e:
            logger.warning(f"Error extracting content from {url}: {str(e)}")
            return None

    def _deduplicate_articles(self, articles: List[Dict]) -> List[Dict]:
        """Remove duplicate articles based on content similarity"""
        unique_articles = []

        for article in articles:
            # Create content hash
            content_for_hash = f"{article['title']} {article.get('summary', '')}"
            content_hash = hashlib.md5(content_for_hash.encode()).hexdigest()

            if content_hash not in self.content_hashes:
                self.content_hashes.add(content_hash)
                unique_articles.append(article)

        logger.info(f"Deduplicated {len(articles)} -> {len(unique_articles)} articles")
        return unique_articles

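    # Note: the dedup key is an exact MD5 over title plus summary, so only
    # byte-identical text collapses; near-duplicates with trivial wording
    # changes survive. A one-off illustration (hypothetical titles):
    #   hashlib.md5(b"Tesla beats estimates").hexdigest()
    #   hashlib.md5(b"Tesla beats estimates.").hexdigest()  # trailing dot -> different hash
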
    def _filter_articles(self, articles: List[Dict], query: str) -> List[Dict]:
        """Filter articles for relevance and quality"""
        filtered_articles = []
        query_lower = query.lower()

        for article in articles:
            # Check minimum content length
            title_summary = f"{article['title']} {article.get('summary', '')}"
            if len(title_summary.strip()) < 50:
                continue

            # Check relevance (more flexible than RSS filtering)
            if (query_lower in article['title'].lower() or
                    query_lower in article.get('summary', '').lower() or
                    any(word in article['title'].lower() for word in query_lower.split())):

                filtered_articles.append(article)

        logger.info(f"Filtered {len(articles)} -> {len(filtered_articles)} articles for relevance")
        return filtered_articles

    def _is_english(self, text: str) -> bool:
        """Check if text is in English using language detection"""
        try:
            if len(text.strip()) < 20:
                return True  # Assume short text is English

            detected_lang = detect(text[:1000])  # Check first 1000 chars
            return detected_lang == 'en'

        except Exception:
            # If detection fails, assume English
            return True

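    # langdetect is probabilistic and can flip labels on borderline text
    # between runs; if reproducible results matter, it can be seeded once at
    # import time (a documented langdetect option):
    #   from langdetect import DetectorFactory
    #   DetectorFactory.seed = 0
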
    def _parse_date(self, date_str: str) -> Optional[datetime]:
        """Parse date from RSS feed"""
        if not date_str:
            return datetime.now()

        try:
            # Try common RSS date formats
            for fmt in ['%a, %d %b %Y %H:%M:%S %Z',
                        '%Y-%m-%dT%H:%M:%SZ',
                        '%Y-%m-%d %H:%M:%S']:
                try:
                    return datetime.strptime(date_str.strip(), fmt)
                except ValueError:
                    continue

            # If all formats fail, return current time
            return datetime.now()

        except Exception:
            return datetime.now()

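    # RSS dates are usually RFC 2822 ("Tue, 05 Mar 2024 14:30:00 GMT"); the
    # stdlib parser handles these, including numeric offsets that %Z misses,
    # and could serve as a first attempt before the format loop above:
    #   from email.utils import parsedate_to_datetime
    #   parsedate_to_datetime('Tue, 05 Mar 2024 14:30:00 +0530')
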
    def _extract_source_name(self, url: str) -> str:
        """Extract source name from URL"""
        try:
            domain = urlparse(url).netloc

            # Clean up common domain patterns
            domain = domain.replace('www.', '').replace('feeds.', '')

            # Map known domains to clean names
            domain_mapping = {
                'news.google.com': 'Google News',
                'finance.yahoo.com': 'Yahoo Finance',
                'reuters.com': 'Reuters',
                'reutersagency.com': 'Reuters',
                'bbc.co.uk': 'BBC',
                'cnbc.com': 'CNBC',
                'marketwatch.com': 'MarketWatch',
                'ft.com': 'Financial Times',
                'bloomberg.com': 'Bloomberg'
            }

            return domain_mapping.get(domain, domain.title())

        except Exception:
            return 'Unknown'

    def get_available_sources(self) -> List[str]:
        """Get list of available news sources"""
        return list(self.rss_sources.keys())

# Additional utility functions for scraping
def clean_html(html_content: str) -> str:
    """Clean HTML content and extract text"""
    try:
        soup = BeautifulSoup(html_content, 'html.parser')

        # Remove script and style elements
        for script in soup(["script", "style"]):
            script.extract()

        # Get text
        text = soup.get_text()

        # Clean up whitespace
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
        text = ' '.join(chunk for chunk in chunks if chunk)

        return text

    except Exception as e:
        logger.error(f"Error cleaning HTML: {str(e)}")
        return ""

def is_valid_article_url(url: str) -> bool:
    """Check if URL is likely to be a valid article URL"""
    try:
        parsed = urlparse(url)

        # Skip certain file types
        skip_extensions = ['.pdf', '.jpg', '.png', '.gif', '.mp4', '.mp3']
        if any(url.lower().endswith(ext) for ext in skip_extensions):
            return False

        # Skip obvious non-article URLs
        skip_patterns = ['login', 'register', 'subscribe', 'newsletter', 'sitemap']
        if any(pattern in url.lower() for pattern in skip_patterns):
            return False

        return True

    except Exception:
        return False
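
# A minimal usage sketch, runnable as a script (needs network access; the
# query and article count are illustrative):
if __name__ == '__main__':
    scraper = NewsletterScraper()
    for art in scraper.scrape_news('Tesla', max_articles=5):
        print(f"{art['source']}: {art['title']}")
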
streamlit_app.py
ADDED
@@ -0,0 +1,562 @@
import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import asyncio
import json
import base64
from datetime import datetime
import io
import os

# Import our modules
from api import NewsAnalyzer
from utils import load_config, cache_results
from report import generate_pdf_report

# Configure page
st.set_page_config(
    page_title="Global Business News Intelligence Dashboard",
    page_icon="📊",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Custom CSS
st.markdown("""
<style>
    .main-header {
        font-size: 2.5rem;
        font-weight: bold;
        text-align: center;
        color: #2E86AB;
        margin-bottom: 2rem;
    }
    .metric-card {
        background-color: #f0f2f6;
        padding: 1rem;
        border-radius: 10px;
        border-left: 4px solid #2E86AB;
    }
    .sentiment-positive { color: #28a745; font-weight: bold; }
    .sentiment-negative { color: #dc3545; font-weight: bold; }
    .sentiment-neutral { color: #6c757d; font-weight: bold; }
    .audio-container {
        background-color: #f8f9fa;
        padding: 10px;
        border-radius: 5px;
        margin: 10px 0;
    }
</style>
""", unsafe_allow_html=True)

# Initialize session state
if 'analyzer' not in st.session_state:
    st.session_state.analyzer = NewsAnalyzer()
if 'results' not in st.session_state:
    st.session_state.results = None
if 'analysis_complete' not in st.session_state:
    st.session_state.analysis_complete = False

def main():
    # Header
    st.markdown('<h1 class="main-header">🌐 Global Business News Intelligence Dashboard</h1>', unsafe_allow_html=True)
    st.markdown("**Real-time sentiment analysis, multilingual summaries, and audio insights for business intelligence**")

    # Sidebar
    with st.sidebar:
        st.header("⚙️ Configuration")

        # Input section
        st.subheader("🎯 Target Analysis")
        query_type = st.selectbox("Query Type", ["Company", "Stock Ticker", "Keyword", "Industry"])
        query = st.text_input(f"Enter {query_type}:", placeholder="e.g., Tesla, TSLA, AI technology")

        st.subheader("📊 Analysis Settings")
        num_articles = st.slider("Number of Articles", 5, 50, 20)
        languages = st.multiselect(
            "Summary Languages",
            ["English", "Hindi", "Tamil"],
            default=["English"]
        )
        include_audio = st.checkbox("Generate Audio Summaries", True)

        st.subheader("🔧 Model Settings")
        sentiment_models = st.multiselect(
            "Sentiment Models",
            ["VADER", "Loughran-McDonald", "FinBERT"],
            default=["VADER", "Loughran-McDonald", "FinBERT"]
        )

        # Analysis button
        analyze_button = st.button("🚀 Analyze News", type="primary", use_container_width=True)

    # Main content area
    if analyze_button and query:
        st.session_state.analysis_complete = False
        with st.spinner("🔍 Analyzing news articles... This may take a few minutes."):
            try:
                # Create progress bar
                progress_bar = st.progress(0)
                status_text = st.empty()

                # Run analysis
                config = {
                    'query': query,
                    'num_articles': num_articles,
                    'languages': languages,
                    'include_audio': include_audio,
                    'sentiment_models': sentiment_models
                }

                # Update progress
                status_text.text("🔍 Scraping articles...")
                progress_bar.progress(20)

                results = st.session_state.analyzer.analyze_news(config, progress_callback=update_progress)
                st.session_state.results = results
                st.session_state.analysis_complete = True

                progress_bar.progress(100)
                status_text.text("✅ Analysis complete!")

            except Exception as e:
                st.error(f"Error during analysis: {str(e)}")
                st.session_state.analysis_complete = False

    # Display results
    if st.session_state.analysis_complete and st.session_state.results:
        display_results(st.session_state.results)

    elif not st.session_state.analysis_complete and query:
        st.info("👆 Click 'Analyze News' to start the analysis")

    else:
        show_demo_dashboard()

def update_progress(progress, status):
    """Callback function for progress updates"""
    # This would be called from the analyzer
    pass

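# update_progress above is a stub; one way to give the analyzer live feedback
# is to bind a callback to the widgets created in main(). A sketch, not the
# app's actual wiring:
#
#   def make_progress_callback(progress_bar, status_text):
#       def _callback(progress: int, status: str):
#           progress_bar.progress(progress)
#           status_text.text(status)
#       return _callback
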
def display_results(results):
    """Display analysis results with interactive dashboard"""
    st.header(f"📈 Analysis Results for: {results['query']}")

    # Key metrics
    col1, col2, col3, col4 = st.columns(4)

    with col1:
        st.markdown('<div class="metric-card">', unsafe_allow_html=True)
        st.metric("Articles Analyzed", len(results['articles']))
        st.markdown('</div>', unsafe_allow_html=True)

    with col2:
        avg_sentiment = results['summary']['average_sentiment']
        sentiment_color = "sentiment-positive" if avg_sentiment > 0.1 else "sentiment-negative" if avg_sentiment < -0.1 else "sentiment-neutral"
        st.markdown('<div class="metric-card">', unsafe_allow_html=True)
        st.metric("Average Sentiment", f"{avg_sentiment:.3f}")
        st.markdown('</div>', unsafe_allow_html=True)

    with col3:
        st.markdown('<div class="metric-card">', unsafe_allow_html=True)
        st.metric("Sources", len(set(article['source'] for article in results['articles'])))
        st.markdown('</div>', unsafe_allow_html=True)

    with col4:
        st.markdown('<div class="metric-card">', unsafe_allow_html=True)
        st.metric("Languages", len(results.get('languages', ['English'])))
        st.markdown('</div>', unsafe_allow_html=True)

    # Tabs for different views
    tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs(["📊 Dashboard", "📰 Articles", "🎯 Sentiment", "🗣️ Audio", "📤 Export", "🔌 API"])

    with tab1:
        display_dashboard(results)

    with tab2:
        display_articles(results)

    with tab3:
        display_sentiment_analysis(results)

    with tab4:
        display_audio_summaries(results)

    with tab5:
        display_export_options(results)

    with tab6:
        display_api_info(results)

def display_dashboard(results):
    """Display main dashboard with charts"""
    col1, col2 = st.columns(2)

    with col1:
        # Sentiment distribution
        st.subheader("📊 Sentiment Distribution")
        sentiment_counts = {
            'Positive': sum(1 for article in results['articles'] if article['sentiment']['compound'] > 0.1),
            'Negative': sum(1 for article in results['articles'] if article['sentiment']['compound'] < -0.1),
            'Neutral': sum(1 for article in results['articles'] if -0.1 <= article['sentiment']['compound'] <= 0.1)
        }

        fig_pie = px.pie(
            values=list(sentiment_counts.values()),
            names=list(sentiment_counts.keys()),
            color_discrete_map={'Positive': '#28a745', 'Negative': '#dc3545', 'Neutral': '#6c757d'}
        )
        st.plotly_chart(fig_pie, use_container_width=True)

    with col2:
        # Source distribution
        st.subheader("📰 Source Distribution")
        source_counts = {}
        for article in results['articles']:
            source = article['source']
            source_counts[source] = source_counts.get(source, 0) + 1

        fig_bar = px.bar(
            x=list(source_counts.keys()),
            y=list(source_counts.values()),
            color=list(source_counts.values()),
            color_continuous_scale="viridis"
        )
        fig_bar.update_layout(xaxis_title="Source", yaxis_title="Article Count")
        st.plotly_chart(fig_bar, use_container_width=True)

    # Timeline chart
    st.subheader("📈 Sentiment Over Time")
    if results['articles']:
        df_timeline = pd.DataFrame([
            {
                'date': article.get('date', datetime.now()),
                'sentiment': article['sentiment']['compound'],
                'title': article['title'][:50] + "..." if len(article['title']) > 50 else article['title']
            }
            for article in results['articles']
            if 'date' in article
        ])

        if not df_timeline.empty:
            fig_timeline = px.scatter(
                df_timeline,
                x='date',
                y='sentiment',
                hover_data=['title'],
                color='sentiment',
                color_continuous_scale=['red', 'gray', 'green'],
                color_continuous_midpoint=0
            )
            fig_timeline.update_layout(
                xaxis_title="Date",
                yaxis_title="Sentiment Score",
                yaxis=dict(range=[-1, 1])
            )
            st.plotly_chart(fig_timeline, use_container_width=True)

    # Keywords word cloud
    st.subheader("🔤 Key Topics")
    if 'keywords' in results and results['keywords']:
        col1, col2 = st.columns([2, 1])

        with col1:
            # Create word cloud
            keywords_text = ' '.join(kw['keyword'] for kw in results['keywords'][:50])
            if keywords_text:
                wordcloud = WordCloud(
                    width=800,
                    height=400,
                    background_color='white',
                    colormap='viridis'
                ).generate(keywords_text)

                fig, ax = plt.subplots(figsize=(10, 5))
                ax.imshow(wordcloud, interpolation='bilinear')
                ax.axis('off')
                st.pyplot(fig)

        with col2:
            st.write("**Top Keywords:**")
            for i, kw in enumerate(results['keywords'][:10]):
                st.write(f"{i+1}. {kw['keyword']} ({kw['score']:.3f})")

def display_articles(results):
    """Display individual articles with summaries"""
    st.subheader(f"📰 Articles ({len(results['articles'])})")

    for i, article in enumerate(results['articles']):
        with st.expander(f"📄 {article['title']}", expanded=(i < 3)):
            col1, col2 = st.columns([3, 1])

            with col1:
                st.write(f"**Source:** {article['source']}")
                if 'date' in article:
                    st.write(f"**Date:** {article['date']}")
                st.write(f"**URL:** {article.get('url', 'N/A')}")

                # Sentiment
                sentiment = article['sentiment']
                sentiment_label = "Positive" if sentiment['compound'] > 0.1 else "Negative" if sentiment['compound'] < -0.1 else "Neutral"
                sentiment_color = "sentiment-positive" if sentiment_label == "Positive" else "sentiment-negative" if sentiment_label == "Negative" else "sentiment-neutral"
                st.markdown(f"**Sentiment:** <span class='{sentiment_color}'>{sentiment_label} ({sentiment['compound']:.3f})</span>", unsafe_allow_html=True)

            with col2:
                # Model-specific scores
                st.write("**Model Scores:**")
                if 'vader' in sentiment:
                    st.write(f"VADER: {sentiment['vader']:.3f}")
                if 'loughran_mcdonald' in sentiment:
                    st.write(f"L&M: {sentiment['loughran_mcdonald']:.3f}")
                if 'finbert' in sentiment:
                    st.write(f"FinBERT: {sentiment['finbert']:.3f}")

            # Summary
            if 'summary' in article:
                st.write("**Summary:**")
                st.write(article['summary'])

            # Multilingual summaries
            if 'summaries' in article:
                for lang, summary in article['summaries'].items():
                    if lang != 'English':
                        st.write(f"**Summary ({lang}):**")
                        st.write(summary)

def display_sentiment_analysis(results):
    """Display detailed sentiment analysis"""
    st.subheader("🎯 Detailed Sentiment Analysis")

    # Model comparison
    if results['articles']:
        model_data = []
        for article in results['articles']:
            sentiment = article['sentiment']
            row = {'title': article['title'][:30] + "..."}
            if 'vader' in sentiment:
                row['VADER'] = sentiment['vader']
            if 'loughran_mcdonald' in sentiment:
                row['Loughran-McDonald'] = sentiment['loughran_mcdonald']
            if 'finbert' in sentiment:
                row['FinBERT'] = sentiment['finbert']
            row['Final Score'] = sentiment['compound']
            model_data.append(row)

        df_models = pd.DataFrame(model_data)
        st.write("**Model Comparison:**")
        st.dataframe(df_models, use_container_width=True)

        # Correlation heatmap
        numeric_cols = [col for col in df_models.columns if col != 'title']
        if len(numeric_cols) > 1:
            corr_matrix = df_models[numeric_cols].corr()
            fig_heatmap = px.imshow(
                corr_matrix,
                text_auto=True,
                aspect="auto",
                color_continuous_scale="RdBu_r",
                color_continuous_midpoint=0
            )
            fig_heatmap.update_layout(title="Model Correlation Matrix")
            st.plotly_chart(fig_heatmap, use_container_width=True)

    # Top positive and negative articles
    col1, col2 = st.columns(2)

    with col1:
        st.write("**Most Positive Articles:**")
        positive_articles = sorted(
            results['articles'],
            key=lambda x: x['sentiment']['compound'],
            reverse=True
        )[:5]

        for article in positive_articles:
            st.write(f"• {article['title'][:50]}... ({article['sentiment']['compound']:.3f})")

    with col2:
        st.write("**Most Negative Articles:**")
        negative_articles = sorted(
            results['articles'],
            key=lambda x: x['sentiment']['compound']
        )[:5]

        for article in negative_articles:
            st.write(f"• {article['title'][:50]}... ({article['sentiment']['compound']:.3f})")

def display_audio_summaries(results):
|
| 391 |
+
"""Display audio summaries for different languages"""
|
| 392 |
+
st.subheader("🎵 Audio Summaries")
|
| 393 |
+
|
| 394 |
+
if 'audio_files' in results:
|
| 395 |
+
for lang, audio_file in results['audio_files'].items():
|
| 396 |
+
st.write(f"**{lang} Summary:**")
|
| 397 |
+
|
| 398 |
+
# Create audio player
|
| 399 |
+
if os.path.exists(audio_file):
|
| 400 |
+
with open(audio_file, 'rb') as audio_file_obj:
|
| 401 |
+
audio_bytes = audio_file_obj.read()
|
| 402 |
+
st.audio(audio_bytes, format='audio/mp3')
|
| 403 |
+
else:
|
| 404 |
+
st.write("Audio file not found")
|
| 405 |
+
else:
|
| 406 |
+
st.info("No audio summaries available. Enable audio generation in settings.")
|
| 407 |
+
|
| 408 |
+
def display_export_options(results):
|
| 409 |
+
"""Display export options"""
|
| 410 |
+
st.subheader("📤 Export Results")
|
| 411 |
+
|
| 412 |
+
col1, col2, col3 = st.columns(3)
|
| 413 |
+
|
| 414 |
+
with col1:
|
| 415 |
+
# CSV Export
|
| 416 |
+
if st.button("📊 Download CSV", use_container_width=True):
|
| 417 |
+
csv_data = prepare_csv_export(results)
|
| 418 |
+
st.download_button(
|
| 419 |
+
label="Click to Download CSV",
|
| 420 |
+
data=csv_data,
|
| 421 |
+
file_name=f"news_analysis_{datetime.now().strftime('%Y%m%d_%H%M')}.csv",
|
| 422 |
+
mime="text/csv"
|
| 423 |
+
)
|
| 424 |
+
|
| 425 |
+
with col2:
|
| 426 |
+
# JSON Export
|
| 427 |
+
if st.button("📋 Download JSON", use_container_width=True):
|
| 428 |
+
json_data = json.dumps(results, indent=2, default=str)
|
| 429 |
+
st.download_button(
|
| 430 |
+
label="Click to Download JSON",
|
| 431 |
+
data=json_data,
|
| 432 |
+
file_name=f"news_analysis_{datetime.now().strftime('%Y%m%d_%H%M')}.json",
|
| 433 |
+
mime="application/json"
|
| 434 |
+
)
|
| 435 |
+
|
| 436 |
+
with col3:
|
| 437 |
+
# PDF Report
|
| 438 |
+
if st.button("📄 Generate PDF Report", use_container_width=True):
|
| 439 |
+
try:
|
| 440 |
+
pdf_buffer = generate_pdf_report(results)
|
| 441 |
+
st.download_button(
|
| 442 |
+
label="Click to Download PDF",
|
| 443 |
+
data=pdf_buffer,
|
| 444 |
+
file_name=f"news_analysis_report_{datetime.now().strftime('%Y%m%d_%H%M')}.pdf",
|
| 445 |
+
mime="application/pdf"
|
| 446 |
+
)
|
| 447 |
+
except Exception as e:
|
| 448 |
+
st.error(f"Error generating PDF: {str(e)}")
|
| 449 |
+
|
| 450 |
+
def display_api_info(results):
|
| 451 |
+
"""Display API information and examples"""
|
| 452 |
+
st.subheader("🔌 API Access")
|
| 453 |
+
|
| 454 |
+
st.write("**Endpoint:** `/api/analyze`")
|
| 455 |
+
st.write("**Method:** GET")
|
| 456 |
+
st.write("**Parameters:**")
|
| 457 |
+
st.code("""
|
| 458 |
+
- query: string (required) - Company name, ticker, or keyword
|
| 459 |
+
- num_articles: integer (default: 20) - Number of articles to analyze
|
| 460 |
+
- languages: array (default: ["English"]) - Summary languages
|
| 461 |
+
- include_audio: boolean (default: true) - Generate audio summaries
|
| 462 |
+
- sentiment_models: array (default: ["VADER", "Loughran-McDonald", "FinBERT"]) - Models to use
|
| 463 |
+
""")
|
| 464 |
+
|
| 465 |
+
st.write("**Example Request:**")
|
| 466 |
+
st.code(f"GET /api/analyze?query={results['query']}&num_articles=20")
|
| 467 |
+
|
| 468 |
+
st.write("**Sample Response:**")
|
| 469 |
+
sample_response = {
|
| 470 |
+
"query": results['query'],
|
| 471 |
+
"total_articles": len(results['articles']),
|
| 472 |
+
"average_sentiment": results['summary']['average_sentiment'],
|
| 473 |
+
"articles": results['articles'][:2] # Show first 2 articles as example
|
| 474 |
+
}
|
| 475 |
+
st.json(sample_response)
|
| 476 |
+
|
| 477 |
+
def prepare_csv_export(results):
|
| 478 |
+
"""Prepare CSV data for export"""
|
| 479 |
+
csv_data = []
|
| 480 |
+
|
| 481 |
+
for article in results['articles']:
|
| 482 |
+
row = {
|
| 483 |
+
'title': article['title'],
|
| 484 |
+
'source': article['source'],
|
| 485 |
+
'url': article.get('url', ''),
|
| 486 |
+
'date': article.get('date', ''),
|
| 487 |
+
'sentiment_compound': article['sentiment']['compound'],
|
| 488 |
+
'sentiment_label': 'Positive' if article['sentiment']['compound'] > 0.1 else 'Negative' if article['sentiment']['compound'] < -0.1 else 'Neutral',
|
| 489 |
+
'summary': article.get('summary', '')
|
| 490 |
+
}
|
| 491 |
+
|
| 492 |
+
# Add model-specific scores
|
| 493 |
+
if 'vader' in article['sentiment']:
|
| 494 |
+
row['vader_score'] = article['sentiment']['vader']
|
| 495 |
+
if 'loughran_mcdonald' in article['sentiment']:
|
| 496 |
+
row['loughran_mcdonald_score'] = article['sentiment']['loughran_mcdonald']
|
| 497 |
+
if 'finbert' in article['sentiment']:
|
| 498 |
+
row['finbert_score'] = article['sentiment']['finbert']
|
| 499 |
+
|
| 500 |
+
csv_data.append(row)
|
| 501 |
+
|
| 502 |
+
df = pd.DataFrame(csv_data)
|
| 503 |
+
return df.to_csv(index=False)
|
| 504 |
+
|
| 505 |
+
def show_demo_dashboard():
|
| 506 |
+
"""Show demo dashboard with sample data"""
|
| 507 |
+
st.header("🚀 Welcome to Global Business News Intelligence")
|
| 508 |
+
|
| 509 |
+
st.markdown("""
|
| 510 |
+
### Key Features:
|
| 511 |
+
- **🔍 Multi-Source News Scraping:** Aggregates news from reliable sources
|
| 512 |
+
- **🎯 Advanced Sentiment Analysis:** Uses VADER, Loughran-McDonald, and FinBERT models
|
| 513 |
+
- **🌐 Multilingual Support:** Summaries in English, Hindi, and Tamil
|
| 514 |
+
- **🎵 Audio Generation:** Text-to-speech for all language summaries
|
| 515 |
+
- **📊 Interactive Dashboard:** Real-time charts and visualizations
|
| 516 |
+
- **📤 Multiple Export Formats:** CSV, JSON, and PDF reports
|
| 517 |
+
- **🔌 API Access:** Programmatic access to all features
|
| 518 |
+
|
| 519 |
+
### Use Cases:
|
| 520 |
+
- **📈 Investment Research:** Track sentiment around stocks and companies
|
| 521 |
+
- **🏢 Brand Monitoring:** Monitor public perception of your brand
|
| 522 |
+
- **🔍 Market Intelligence:** Stay informed about industry trends
|
| 523 |
+
- **📰 Media Analysis:** Analyze coverage patterns across sources
|
| 524 |
+
- **🌍 Global Insights:** Access news in multiple languages
|
| 525 |
+
|
| 526 |
+
### Get Started:
|
| 527 |
+
1. Enter a company name, stock ticker, or keyword in the sidebar
|
| 528 |
+
2. Configure your analysis settings
|
| 529 |
+
3. Click "Analyze News" to start
|
| 530 |
+
4. Explore results in the interactive dashboard
|
| 531 |
+
5. Export your findings in multiple formats
|
| 532 |
+
""")
|
| 533 |
+
|
| 534 |
+
# Sample visualization
|
| 535 |
+
st.subheader("📊 Sample Analysis Dashboard")
|
| 536 |
+
|
| 537 |
+
# Create sample data
|
| 538 |
+
sample_data = {
|
| 539 |
+
'Sentiment': ['Positive', 'Negative', 'Neutral'],
|
| 540 |
+
'Count': [45, 15, 40]
|
| 541 |
+
}
|
| 542 |
+
|
| 543 |
+
fig = px.pie(
|
| 544 |
+
values=sample_data['Count'],
|
| 545 |
+
names=sample_data['Sentiment'],
|
| 546 |
+
color_discrete_map={'Positive': '#28a745', 'Negative': '#dc3545', 'Neutral': '#6c757d'},
|
| 547 |
+
title="Sample Sentiment Distribution"
|
| 548 |
+
)
|
| 549 |
+
|
| 550 |
+
col1, col2 = st.columns([1, 1])
|
| 551 |
+
with col1:
|
| 552 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 553 |
+
|
| 554 |
+
with col2:
|
| 555 |
+
st.write("**Sample Metrics:**")
|
| 556 |
+
st.metric("Articles Analyzed", "100")
|
| 557 |
+
st.metric("Average Sentiment", "0.234")
|
| 558 |
+
st.metric("Sources Covered", "15")
|
| 559 |
+
st.metric("Languages", "3")
|
| 560 |
+
|
| 561 |
+
if __name__ == "__main__":
|
| 562 |
+
main()
|
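For quick testing outside the dashboard, a minimal client sketch for the `/api/analyze` endpoint documented in `display_api_info` above. The base URL and the `Tesla` query are assumptions; the printed fields mirror the sample response the dashboard renders.

import requests  # any HTTP client works; requests is assumed here

BASE_URL = "http://localhost:8000"  # assumption: wherever the FastAPI backend is served

resp = requests.get(
    f"{BASE_URL}/api/analyze",
    params={"query": "Tesla", "num_articles": 20},  # parameters from the docs above
    timeout=120,  # first call may be slow while models load
)
resp.raise_for_status()
payload = resp.json()
print(payload["total_articles"], payload["average_sentiment"])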
summarizer_module.py
ADDED
@@ -0,0 +1,400 @@
import logging
from typing import List, Optional
import re
from transformers import pipeline, AutoTokenizer
import torch

logger = logging.getLogger(__name__)

class TextSummarizer:
    """Text summarization with chunking for long documents"""

    def __init__(self):
        self.summarizer = None
        self.tokenizer = None
        self.max_chunk_length = 1024  # Maximum tokens per chunk
        self.max_summary_length = 150
        self.min_summary_length = 50

        self._initialize_model()
        logger.info("TextSummarizer initialized")

    def _initialize_model(self):
        """Initialize the summarization model"""
        try:
            # Try different models in order of preference
            model_names = [
                "facebook/bart-large-cnn",
                "sshleifer/distilbart-cnn-12-6",
                "t5-small"
            ]

            for model_name in model_names:
                try:
                    # Use CPU to avoid memory issues on Hugging Face Spaces
                    device = -1  # CPU only for Hugging Face Spaces

                    self.summarizer = pipeline(
                        "summarization",
                        model=model_name,
                        tokenizer=model_name,
                        device=device,
                        framework="pt"
                    )

                    self.tokenizer = AutoTokenizer.from_pretrained(model_name)
                    logger.info(f"Successfully loaded summarization model: {model_name}")
                    break

                except Exception as e:
                    logger.warning(f"Failed to load {model_name}: {str(e)}")
                    continue

            if self.summarizer is None:
                logger.error("Failed to load any summarization model")

        except Exception as e:
            logger.error(f"Error initializing summarizer: {str(e)}")

    def summarize(self, text: str, max_length: int = None, min_length: int = None) -> str:
        """Summarize text with automatic chunking for long documents"""
        if not text or not text.strip():
            return ""

        if not self.summarizer:
            return self._fallback_summarize(text)

        try:
            # Use provided lengths or defaults
            max_len = max_length or self.max_summary_length
            min_len = min_length or self.min_summary_length

            # Check if text needs chunking
            if self._needs_chunking(text):
                return self._summarize_long_text(text, max_len, min_len)
            else:
                return self._summarize_chunk(text, max_len, min_len)

        except Exception as e:
            logger.error(f"Summarization failed: {str(e)}")
            return self._fallback_summarize(text)

    def _needs_chunking(self, text: str) -> bool:
        """Check if text needs to be chunked"""
        if not self.tokenizer:
            return len(text.split()) > 300  # Rough word count threshold

        try:
            tokens = self.tokenizer.encode(text, add_special_tokens=True)
            return len(tokens) > self.max_chunk_length
        except Exception:
            return len(text.split()) > 300

    def _summarize_long_text(self, text: str, max_len: int, min_len: int) -> str:
        """Summarize long text by chunking"""
        try:
            # Split text into chunks
            chunks = self._split_into_chunks(text)

            if not chunks:
                return self._fallback_summarize(text)

            # Summarize each chunk
            chunk_summaries = []
            for chunk in chunks:
                if len(chunk.strip()) > 100:  # Only summarize substantial chunks
                    summary = self._summarize_chunk(
                        chunk,
                        max_length=min(max_len // len(chunks) + 20, 100),
                        min_length=20
                    )
                    if summary and summary.strip():
                        chunk_summaries.append(summary)

            if not chunk_summaries:
                return self._fallback_summarize(text)

            # Combine chunk summaries
            combined_summary = " ".join(chunk_summaries)

            # If combined summary is still too long, summarize again
            if self._needs_chunking(combined_summary) and len(chunk_summaries) > 1:
                final_summary = self._summarize_chunk(combined_summary, max_len, min_len)
                return final_summary if final_summary else combined_summary

            return combined_summary

        except Exception as e:
            logger.error(f"Long text summarization failed: {str(e)}")
            return self._fallback_summarize(text)

    def _summarize_chunk(self, text: str, max_length: int, min_length: int) -> str:
        """Summarize a single chunk of text"""
        try:
            if not text or len(text.strip()) < 50:
                return text

            # Clean text
            cleaned_text = self._clean_text_for_summarization(text)

            if not cleaned_text:
                return text[:200] + "..." if len(text) > 200 else text

            # Generate summary
            result = self.summarizer(
                cleaned_text,
                max_length=max_length,
                min_length=min_length,
                do_sample=False,
                truncation=True
            )

            if result and len(result) > 0 and 'summary_text' in result[0]:
                summary = result[0]['summary_text'].strip()

                # Post-process summary
                summary = self._post_process_summary(summary)

                return summary if summary else cleaned_text[:200] + "..."

            return cleaned_text[:200] + "..."

        except Exception as e:
            logger.error(f"Chunk summarization failed: {str(e)}")
            return text[:200] + "..." if len(text) > 200 else text

    def _split_into_chunks(self, text: str) -> List[str]:
        """Split text into manageable chunks"""
        try:
            # Split by paragraphs first
            paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]

            if not paragraphs:
                paragraphs = [text]

            chunks = []
            current_chunk = ""
            current_length = 0

            for paragraph in paragraphs:
                paragraph_length = len(paragraph.split())

                # If adding this paragraph would exceed chunk size, start new chunk
                if current_length + paragraph_length > 250 and current_chunk:
                    chunks.append(current_chunk.strip())
                    current_chunk = paragraph
                    current_length = paragraph_length
                else:
                    if current_chunk:
                        current_chunk += "\n\n" + paragraph
                    else:
                        current_chunk = paragraph
                    current_length += paragraph_length

            # Add remaining chunk
            if current_chunk.strip():
                chunks.append(current_chunk.strip())

            # If no proper chunks, split by sentences
            if not chunks or (len(chunks) == 1 and len(chunks[0].split()) > 400):
                return self._split_by_sentences(text)

            return chunks

        except Exception as e:
            logger.error(f"Text splitting failed: {str(e)}")
            return [text]

    def _split_by_sentences(self, text: str) -> List[str]:
        """Split text by sentences as fallback"""
        try:
            sentences = re.split(r'[.!?]+\s+', text)
            chunks = []
            current_chunk = ""

            for sentence in sentences:
                if len((current_chunk + " " + sentence).split()) > 200:
                    if current_chunk:
                        chunks.append(current_chunk.strip())
                    current_chunk = sentence
                else:
                    if current_chunk:
                        current_chunk += ". " + sentence
                    else:
                        current_chunk = sentence

            if current_chunk.strip():
                chunks.append(current_chunk.strip())

            return chunks if chunks else [text]

        except Exception as e:
            logger.error(f"Sentence splitting failed: {str(e)}")
            return [text]

    def _clean_text_for_summarization(self, text: str) -> str:
        """Clean text for better summarization"""
        if not text:
            return ""

        # Remove URLs
        text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)

        # Remove email addresses
        text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '', text)

        # Remove excessive whitespace
        text = re.sub(r'\s+', ' ', text)

        # Remove common news artifacts
        artifacts = [
            r'\(Reuters\)', r'\(AP\)', r'\(Bloomberg\)', r'\(CNN\)',
            r'-- .*$', r'Photo:.*$', r'Image:.*$', r'Video:.*$',
            r'Subscribe.*$', r'Follow us.*$'
        ]

        for artifact in artifacts:
            text = re.sub(artifact, '', text, flags=re.IGNORECASE | re.MULTILINE)

        return text.strip()

    def _post_process_summary(self, summary: str) -> str:
        """Post-process generated summary"""
        if not summary:
            return ""

        # Remove incomplete sentences at the end
        sentences = re.split(r'[.!?]+', summary)
        if len(sentences) > 1 and len(sentences[-1].strip()) < 10:
            summary = '.'.join(sentences[:-1]) + '.'

        # Capitalize first letter
        summary = summary[0].upper() + summary[1:] if len(summary) > 1 else summary.upper()

        # Ensure summary ends with punctuation
        if summary and summary[-1] not in '.!?':
            summary += '.'

        return summary.strip()

    def _fallback_summarize(self, text: str) -> str:
        """Fallback summarization using simple extraction"""
        try:
            if not text or len(text.strip()) < 50:
                return text

            # Split into sentences
            sentences = re.split(r'[.!?]+', text)
            sentences = [s.strip() for s in sentences if s.strip() and len(s.split()) > 5]

            if not sentences:
                return text[:200] + "..." if len(text) > 200 else text

            # Take first few sentences (extractive summary)
            num_sentences = min(3, len(sentences))
            summary_sentences = sentences[:num_sentences]

            summary = '. '.join(summary_sentences)
            if not summary.endswith('.'):
                summary += '.'

            # If summary is too long, truncate
            if len(summary) > 300:
                words = summary.split()
                summary = ' '.join(words[:40]) + '...'

            return summary

        except Exception as e:
            logger.error(f"Fallback summarization failed: {str(e)}")
            return text[:200] + "..." if len(text) > 200 else text

    def batch_summarize(self, texts: List[str], **kwargs) -> List[str]:
        """Summarize multiple texts"""
        summaries = []

        for text in texts:
            try:
                summary = self.summarize(text, **kwargs)
                summaries.append(summary)
            except Exception as e:
                logger.error(f"Batch summarization failed for one text: {str(e)}")
                summaries.append(self._fallback_summarize(text))

        return summaries

    def get_summary_stats(self, original_text: str, summary: str) -> dict:
        """Get statistics about the summarization"""
        try:
            original_words = len(original_text.split())
            summary_words = len(summary.split())

            compression_ratio = summary_words / original_words if original_words > 0 else 0

            return {
                'original_length': original_words,
                'summary_length': summary_words,
                'compression_ratio': compression_ratio,
                'compression_percentage': (1 - compression_ratio) * 100
            }

        except Exception as e:
            logger.error(f"Error calculating summary stats: {str(e)}")
            return {
                'original_length': 0,
                'summary_length': 0,
                'compression_ratio': 0,
                'compression_percentage': 0
            }

# Utility functions
def extract_key_sentences(text: str, num_sentences: int = 3) -> List[str]:
    """Extract key sentences using simple heuristics"""
    try:
        sentences = re.split(r'[.!?]+', text)
        sentences = [s.strip() for s in sentences if s.strip() and len(s.split()) > 5]

        if not sentences:
            return []

        # Score sentences based on position and keyword density
        scored_sentences = []

        for i, sentence in enumerate(sentences):
            score = 0

            # Position bonus (earlier sentences get higher scores)
            if i < len(sentences) * 0.3:
                score += 3
            elif i < len(sentences) * 0.6:
                score += 2
            else:
                score += 1

            # Length bonus (medium-length sentences preferred)
            words = len(sentence.split())
            if 10 <= words <= 25:
                score += 2
            elif 5 <= words <= 35:
                score += 1

            # Keyword bonus (sentences with common business/finance terms)
            keywords = [
                'company', 'business', 'revenue', 'profit', 'growth', 'market',
                'financial', 'earnings', 'investment', 'stock', 'shares', 'economy'
            ]

            sentence_lower = sentence.lower()
            keyword_count = sum(1 for keyword in keywords if keyword in sentence_lower)
            score += keyword_count

            scored_sentences.append((sentence, score))

        # Sort by score and return top sentences
        scored_sentences.sort(key=lambda x: x[1], reverse=True)

        return [sent[0] for sent in scored_sentences[:num_sentences]]

    except Exception as e:
        logger.error(f"Key sentence extraction failed: {str(e)}")
        return []
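A minimal usage sketch for the class above, assuming the file is saved as `summarizer.py` (the name `api_backend` imports); the sample article text and length arguments are illustrative.

from summarizer import TextSummarizer, extract_key_sentences

ts = TextSummarizer()

article = (
    "The company reported quarterly revenue of 4.2 billion dollars, beating "
    "analyst estimates. Management raised full-year guidance on strong demand. "
    "Shares rose in after-hours trading as investors digested the results."
)

summary = ts.summarize(article, max_length=60, min_length=20)
stats = ts.get_summary_stats(article, summary)

print(summary)
print(f"Compression: {stats['compression_percentage']:.0f}%")
print(extract_key_sentences(article, num_sentences=2))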
translator_module (1).py
ADDED
@@ -0,0 +1,336 @@
import logging
from typing import Any, Dict, List, Optional
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import re

logger = logging.getLogger(__name__)

class MultilingualTranslator:
    """Multilingual translation with support for Hindi and Tamil"""

    def __init__(self):
        self.translators = {}
        self.language_codes = {
            'Hindi': 'hi',
            'Tamil': 'ta',
            'English': 'en'
        }

        # Supported translation pairs
        self.supported_pairs = {
            'en-hi': 'Helsinki-NLP/opus-mt-en-hi',
            'en-ta': 'Helsinki-NLP/opus-mt-en-mul',  # Multilingual model for Tamil
            'hi-en': 'Helsinki-NLP/opus-mt-hi-en',
            'ta-en': 'Helsinki-NLP/opus-mt-mul-en'
        }

        self._initialize_models()
        logger.info("MultilingualTranslator initialized")

    def _initialize_models(self):
        """Initialize translation models on-demand"""
        # Don't load all models at startup to save memory;
        # they will be loaded when first needed.
        logger.info("Translation models will be loaded on-demand")

    def _load_translator(self, source_lang: str, target_lang: str) -> Optional[object]:
        """Load a specific translator model"""
        pair_key = f"{source_lang}-{target_lang}"

        if pair_key in self.translators:
            return self.translators[pair_key]

        try:
            model_name = self.supported_pairs.get(pair_key)
            if not model_name:
                logger.error(f"No model available for {source_lang} -> {target_lang}")
                return None

            # Use CPU for Hugging Face Spaces compatibility
            device = -1  # CPU only

            translator = pipeline(
                "translation",
                model=model_name,
                device=device,
                framework="pt"
            )

            self.translators[pair_key] = translator
            logger.info(f"Loaded translator for {source_lang} -> {target_lang}")

            return translator

        except Exception as e:
            logger.error(f"Failed to load translator {pair_key}: {str(e)}")
            return None

    def translate(self, text: str, target_lang: str, source_lang: str = 'English') -> str:
        """Translate text to target language"""
        if not text or not text.strip():
            return ""

        # Get language codes
        source_code = self.language_codes.get(source_lang, 'en')
        target_code = self.language_codes.get(target_lang, target_lang.lower()[:2])

        # If source and target are the same, return original text
        if source_code == target_code:
            return text

        try:
            # Load the appropriate translator
            translator = self._load_translator(source_code, target_code)

            if not translator:
                return self._fallback_translate(text, target_lang)

            # Clean and prepare text
            cleaned_text = self._prepare_text_for_translation(text)

            if not cleaned_text:
                return text

            # Split long text into chunks for translation
            if len(cleaned_text.split()) > 200:
                return self._translate_long_text(cleaned_text, translator)
            else:
                return self._translate_chunk(cleaned_text, translator)

        except Exception as e:
            logger.error(f"Translation failed: {str(e)}")
            return self._fallback_translate(text, target_lang)

    def _translate_chunk(self, text: str, translator) -> str:
        """Translate a single chunk of text"""
        try:
            result = translator(text, max_length=512)

            if result and len(result) > 0:
                translated = result[0].get('translation_text', text)
                return self._post_process_translation(translated)

            return text

        except Exception as e:
            logger.error(f"Chunk translation failed: {str(e)}")
            return text

    def _translate_long_text(self, text: str, translator) -> str:
        """Translate long text by splitting into chunks"""
        try:
            # Split by sentences
            sentences = self._split_into_sentences(text)

            if not sentences:
                return text

            translated_sentences = []
            current_chunk = ""

            for sentence in sentences:
                # If adding this sentence would make the chunk too long, translate current chunk
                if len((current_chunk + " " + sentence).split()) > 150 and current_chunk:
                    translated = self._translate_chunk(current_chunk, translator)
                    translated_sentences.append(translated)
                    current_chunk = sentence
                else:
                    if current_chunk:
                        current_chunk += " " + sentence
                    else:
                        current_chunk = sentence

            # Translate remaining chunk
            if current_chunk:
                translated = self._translate_chunk(current_chunk, translator)
                translated_sentences.append(translated)

            return " ".join(translated_sentences)

        except Exception as e:
            logger.error(f"Long text translation failed: {str(e)}")
            return text

    def _split_into_sentences(self, text: str) -> List[str]:
        """Split text into sentences"""
        try:
            # Simple sentence splitting
            sentences = re.split(r'[.!?]+\s+', text)
            sentences = [s.strip() for s in sentences if s.strip()]

            return sentences

        except Exception as e:
            logger.error(f"Sentence splitting failed: {str(e)}")
            return [text]

    def _prepare_text_for_translation(self, text: str) -> str:
        """Prepare text for translation"""
        if not text:
            return ""

        # Remove URLs
        text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)

        # Remove email addresses
        text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '', text)

        # Clean excessive whitespace
        text = re.sub(r'\s+', ' ', text)

        # Remove special characters that might cause issues
        text = re.sub(r'[^\w\s.,!?;:\-\'"()/%$]', '', text)

        return text.strip()

    def _post_process_translation(self, text: str) -> str:
        """Post-process translated text"""
        if not text:
            return ""

        # Clean up extra spaces
        text = re.sub(r'\s+', ' ', text)

        # Capitalize first letter if it's a sentence
        if text and len(text) > 1:
            text = text[0].upper() + text[1:]

        return text.strip()

    def _fallback_translate(self, text: str, target_lang: str) -> str:
        """Fallback translation with basic text processing"""
        logger.warning(f"Using fallback translation for {target_lang}")

        # For demonstration purposes, return the original text with a note.
        # In a production system, you might use a different translation service.
        if target_lang.lower() in ['hindi', 'hi']:
            return f"[Hindi] {text}"
        elif target_lang.lower() in ['tamil', 'ta']:
            return f"[Tamil] {text}"
        else:
            return text

    def batch_translate(self, texts: List[str], target_lang: str, source_lang: str = 'English') -> List[str]:
        """Translate multiple texts"""
        translations = []

        for text in texts:
            try:
                translation = self.translate(text, target_lang, source_lang)
                translations.append(translation)
            except Exception as e:
                logger.error(f"Batch translation failed for one text: {str(e)}")
                translations.append(self._fallback_translate(text, target_lang))

        return translations

    def detect_language(self, text: str) -> str:
        """Simple language detection (basic implementation)"""
        try:
            # Basic detection using character patterns
            if not text:
                return 'en'

            # Check for Devanagari script (Hindi)
            if re.search(r'[\u0900-\u097F]', text):
                return 'hi'

            # Check for Tamil script
            if re.search(r'[\u0B80-\u0BFF]', text):
                return 'ta'

            # Default to English
            return 'en'

        except Exception as e:
            logger.error(f"Language detection failed: {str(e)}")
            return 'en'

    def get_supported_languages(self) -> List[str]:
        """Get list of supported languages"""
        return list(self.language_codes.keys())

    def is_translation_available(self, source_lang: str, target_lang: str) -> bool:
        """Check if translation is available between two languages"""
        source_code = self.language_codes.get(source_lang, source_lang.lower()[:2])
        target_code = self.language_codes.get(target_lang, target_lang.lower()[:2])

        pair_key = f"{source_code}-{target_code}"
        return pair_key in self.supported_pairs

    def translate_with_confidence(self, text: str, target_lang: str, source_lang: str = 'English') -> Dict[str, Any]:
        """Translate text and return result with confidence metrics"""
        try:
            translated_text = self.translate(text, target_lang, source_lang)

            # Simple confidence calculation based on text characteristics
            confidence = self._calculate_translation_confidence(text, translated_text, target_lang)

            return {
                'original_text': text,
                'translated_text': translated_text,
                'source_language': source_lang,
                'target_language': target_lang,
                'confidence': confidence,
                'method': 'neural_translation' if translated_text != text else 'fallback'
            }

        except Exception as e:
            logger.error(f"Translation with confidence failed: {str(e)}")
            return {
                'original_text': text,
                'translated_text': text,
                'source_language': source_lang,
                'target_language': target_lang,
                'confidence': 0.0,
                'method': 'error',
                'error': str(e)
            }

    def _calculate_translation_confidence(self, original: str, translated: str, target_lang: str) -> float:
        """Calculate a simple confidence score for translation"""
        try:
            # If translation failed (same as original), low confidence
            if original == translated and target_lang != 'English':
                return 0.2

            # If text is very short, moderate confidence
            if len(original.split()) < 5:
                return 0.7

            # If translation is significantly different in length, lower confidence
            original_len = len(original.split())
            translated_len = len(translated.split())

            length_ratio = min(original_len, translated_len) / max(original_len, translated_len)

            if length_ratio < 0.5:
                return 0.6
            elif length_ratio < 0.7:
                return 0.8
            else:
                return 0.9

        except Exception as e:
            logger.error(f"Confidence calculation failed: {str(e)}")
            return 0.5

# Utility functions
def get_language_name(code: str) -> str:
    """Get full language name from code"""
    code_to_name = {
        'en': 'English',
        'hi': 'Hindi',
        'ta': 'Tamil'
    }
    return code_to_name.get(code.lower(), code)

def get_language_code(name: str) -> str:
    """Get language code from name"""
    name_to_code = {
        'english': 'en',
        'hindi': 'hi',
        'tamil': 'ta'
    }
    return name_to_code.get(name.lower(), name.lower()[:2])
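A usage sketch for the translator, again assuming the module is importable as `translator` (as `api_backend` does); unsupported pairs fall back to the tagged passthrough in `_fallback_translate`, and the Tamil sample string is only there to exercise script detection.

from translator import MultilingualTranslator, get_language_code

mt = MultilingualTranslator()

text = "Quarterly revenue grew faster than analysts expected."

if mt.is_translation_available('English', 'Hindi'):
    result = mt.translate_with_confidence(text, target_lang='Hindi')
    print(result['translated_text'])
    print(result['method'], result['confidence'])

print(mt.detect_language("நிறுவனத்தின் வருவாய் உயர்ந்தது"))  # Tamil script -> 'ta'
print(get_language_code('Hindi'))  # -> 'hi'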
tts_module.py
ADDED
@@ -0,0 +1,336 @@
| 1 |
+
import logging
|
| 2 |
+
import os
|
| 3 |
+
import tempfile
|
| 4 |
+
from typing import Dict, List, Optional
|
| 5 |
+
import hashlib
|
| 6 |
+
from datetime import datetime
|
| 7 |
+
|
| 8 |
+
# gTTS for text-to-speech
|
| 9 |
+
try:
|
| 10 |
+
from gtts import gTTS
|
| 11 |
+
GTTS_AVAILABLE = True
|
| 12 |
+
except ImportError:
|
| 13 |
+
GTTS_AVAILABLE = False
|
| 14 |
+
|
| 15 |
+
logger = logging.getLogger(__name__)
|
| 16 |
+
|
| 17 |
+
class AudioGenerator:
|
| 18 |
+
"""Text-to-speech audio generation with multilingual support"""
|
| 19 |
+
|
| 20 |
+
def __init__(self):
|
| 21 |
+
self.supported_languages = {
|
| 22 |
+
'English': 'en',
|
| 23 |
+
'Hindi': 'hi',
|
| 24 |
+
'Tamil': 'ta'
|
| 25 |
+
}
|
| 26 |
+
|
| 27 |
+
# Audio cache directory
|
| 28 |
+
self.cache_dir = tempfile.mkdtemp(prefix='news_audio_')
|
| 29 |
+
self.audio_cache = {}
|
| 30 |
+
|
| 31 |
+
logger.info(f"AudioGenerator initialized with cache directory: {self.cache_dir}")
|
| 32 |
+
|
| 33 |
+
if not GTTS_AVAILABLE:
|
| 34 |
+
logger.warning("gTTS not available. Audio generation will be limited.")
|
| 35 |
+
|
| 36 |
+
def generate_audio(self, text: str, language: str = 'English', output_file: str = None) -> Optional[str]:
|
| 37 |
+
"""Generate audio from text"""
|
| 38 |
+
if not text or not text.strip():
|
| 39 |
+
logger.warning("Empty text provided for audio generation")
|
| 40 |
+
return None
|
| 41 |
+
|
| 42 |
+
if not GTTS_AVAILABLE:
|
| 43 |
+
logger.error("gTTS not available for audio generation")
|
| 44 |
+
return None
|
| 45 |
+
|
| 46 |
+
try:
|
| 47 |
+
# Get language code
|
| 48 |
+
lang_code = self.supported_languages.get(language, 'en')
|
| 49 |
+
|
| 50 |
+
# Create cache key
|
| 51 |
+
cache_key = self._create_cache_key(text, language)
|
| 52 |
+
|
| 53 |
+
# Check cache first
|
| 54 |
+
if cache_key in self.audio_cache:
|
| 55 |
+
cached_file = self.audio_cache[cache_key]
|
| 56 |
+
if os.path.exists(cached_file):
|
| 57 |
+
logger.info(f"Using cached audio for {language}")
|
| 58 |
+
return cached_file
|
| 59 |
+
|
| 60 |
+
# Generate output filename if not provided
|
| 61 |
+
if not output_file:
|
| 62 |
+
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 63 |
+
output_file = os.path.join(self.cache_dir, f"audio_{lang_code}_{timestamp}.mp3")
|
| 64 |
+
elif not os.path.dirname(output_file):
|
| 65 |
+
output_file = os.path.join(self.cache_dir, output_file)
|
| 66 |
+
|
| 67 |
+
# Prepare text for TTS
|
| 68 |
+
clean_text = self._prepare_text_for_tts(text)
|
| 69 |
+
|
| 70 |
+
if not clean_text:
|
| 71 |
+
logger.warning("No valid text for TTS after cleaning")
|
| 72 |
+
return None
|
| 73 |
+
|
| 74 |
+
# Generate audio using gTTS
|
| 75 |
+
if lang_code in ['en', 'hi']:
|
| 76 |
+
# gTTS supports English and Hindi directly
|
| 77 |
+
tts = gTTS(text=clean_text, lang=lang_code, slow=False)
|
| 78 |
+
elif lang_code == 'ta':
|
| 79 |
+
# For Tamil, use English as fallback or try Tamil if available
|
| 80 |
+
try:
|
| 81 |
+
tts = gTTS(text=clean_text, lang='ta', slow=False)
|
| 82 |
+
except:
|
| 83 |
+
logger.warning("Tamil not supported in gTTS, using English")
|
| 84 |
+
tts = gTTS(text=clean_text, lang='en', slow=False)
|
| 85 |
+
else:
|
| 86 |
+
# Default to English
|
| 87 |
+
tts = gTTS(text=clean_text, lang='en', slow=False)
|
| 88 |
+
|
| 89 |
+
# Save audio file
|
| 90 |
+
tts.save(output_file)
|
| 91 |
+
|
| 92 |
+
# Verify file was created
|
| 93 |
+
if os.path.exists(output_file) and os.path.getsize(output_file) > 0:
|
| 94 |
+
# Cache the result
|
| 95 |
+
self.audio_cache[cache_key] = output_file
|
| 96 |
+
|
| 97 |
+
logger.info(f"Audio generated successfully: {output_file}")
|
| 98 |
+
return output_file
|
| 99 |
+
else:
|
| 100 |
+
logger.error("Audio file was not created or is empty")
|
| 101 |
+
return None
|
| 102 |
+
|
| 103 |
+
except Exception as e:
|
| 104 |
+
logger.error(f"Audio generation failed: {str(e)}")
|
| 105 |
+
return None
|
| 106 |
+
|
| 107 |
+
def _create_cache_key(self, text: str, language: str) -> str:
|
| 108 |
+
"""Create a cache key for the text and language combination"""
|
| 109 |
+
try:
|
| 110 |
+
combined = f"{text[:500]}_{language}" # Use first 500 chars to avoid very long keys
|
| 111 |
+
return hashlib.md5(combined.encode()).hexdigest()
|
| 112 |
+
except Exception as e:
|
| 113 |
+
logger.error(f"Cache key creation failed: {str(e)}")
|
| 114 |
+
return f"default_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
|
| 115 |
+
|
| 116 |
+
def _prepare_text_for_tts(self, text: str) -> str:
|
| 117 |
+
"""Prepare text for text-to-speech conversion"""
|
| 118 |
+
if not text:
|
| 119 |
+
return ""
|
| 120 |
+
|
| 121 |
+
# Remove or replace problematic characters
|
| 122 |
+
import re
|
| 123 |
+
|
| 124 |
+
# Remove URLs
|
| 125 |
+
text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
|
| 126 |
+
|
| 127 |
+
# Remove email addresses
|
| 128 |
+
text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '', text)
|
| 129 |
+
|
| 130 |
+
# Replace multiple spaces with single space
|
| 131 |
+
text = re.sub(r'\s+', ' ', text)
|
| 132 |
+
|
| 133 |
+
# Remove excessive punctuation
|
| 134 |
+
text = re.sub(r'[.]{3,}', '...', text)
|
| 135 |
+
text = re.sub(r'[!]{2,}', '!', text)
|
| 136 |
+
text = re.sub(r'[?]{2,}', '?', text)
|
| 137 |
+
|
| 138 |
+
# Remove parenthetical citations and references
|
| 139 |
+
text = re.sub(r'\([^)]*\)', '', text)
|
| 140 |
+
text = re.sub(r'\[[^\]]*\]', '', text)
|
| 141 |
+
|
| 142 |
+
# Limit text length for TTS (gTTS has limits)
|
| 143 |
+
max_length = 5000 # Characters
|
| 144 |
+
if len(text) > max_length:
|
| 145 |
+
# Try to cut at sentence boundary
|
| 146 |
+
sentences = re.split(r'[.!?]+', text[:max_length])
|
| 147 |
+
if len(sentences) > 1:
|
| 148 |
+
text = '. '.join(sentences[:-1]) + '.'
|
| 149 |
+
else:
|
| 150 |
+
text = text[:max_length] + '...'
|
| 151 |
+
|
| 152 |
+
return text.strip()
|
| 153 |
+
|
| 154 |
+
def generate_batch_audio(self, texts: Dict[str, str], language: str = 'English') -> Dict[str, str]:
|
| 155 |
+
"""Generate audio for multiple texts"""
|
| 156 |
+
results = {}
|
| 157 |
+
|
| 158 |
+
for key, text in texts.items():
|
| 159 |
+
try:
|
| 160 |
+
output_file = f"audio_{key}_{language.lower()}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp3"
|
| 161 |
+
audio_file = self.generate_audio(text, language, output_file)
|
| 162 |
+
results[key] = audio_file
|
| 163 |
+
except Exception as e:
|
| 164 |
+
logger.error(f"Batch audio generation failed for {key}: {str(e)}")
|
| 165 |
+
results[key] = None
|
| 166 |
+
|
| 167 |
+
return results
|
| 168 |
+
|
| 169 |
+
def generate_summary_audio(self, articles: List[Dict], languages: List[str] = None) -> Dict[str, str]:
|
| 170 |
+
"""Generate audio summaries for articles in multiple languages"""
|
| 171 |
+
if languages is None:
|
| 172 |
+
languages = ['English']
|
| 173 |
+
|
| 174 |
+
audio_files = {}
|
| 175 |
+
|
| 176 |
+
try:
|
| 177 |
+
# Create overall summary text
|
| 178 |
+
summary_text = self._create_audio_summary(articles)
|
| 179 |
+
|
| 180 |
+
if not summary_text:
|
| 181 |
+
logger.warning("No summary text created for audio")
|
| 182 |
+
return audio_files
|
| 183 |
+
|
| 184 |
+
# Generate audio for each language
|
| 185 |
+
for language in languages:
|
| 186 |
+
if language in self.supported_languages:
|
| 187 |
+
try:
|
| 188 |
+
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 189 |
+
output_file = f"summary_{language.lower()}_{timestamp}.mp3"
|
| 190 |
+
|
| 191 |
+
audio_file = self.generate_audio(summary_text, language, output_file)
|
| 192 |
+
|
| 193 |
+
if audio_file:
|
| 194 |
+
audio_files[language] = audio_file
|
| 195 |
+
else:
|
| 196 |
+
logger.warning(f"Failed to generate audio for {language}")
|
| 197 |
+
|
| 198 |
+
except Exception as e:
|
| 199 |
+
logger.error(f"Audio generation failed for {language}: {str(e)}")
|
| 200 |
+
continue
|
| 201 |
+
else:
|
| 202 |
+
logger.warning(f"Language {language} not supported for audio")
|
| 203 |
+
|
| 204 |
+
return audio_files
|
| 205 |
+
|
| 206 |
+
except Exception as e:
|
| 207 |
+
logger.error(f"Summary audio generation failed: {str(e)}")
|
| 208 |
+
return audio_files
|
| 209 |
+
|
| 210 |
+
def _create_audio_summary(self, articles: List[Dict]) -> str:
|
| 211 |
+
"""Create a comprehensive audio summary from articles"""
|
| 212 |
+
try:
|
| 213 |
+
if not articles:
|
| 214 |
+
return ""
|
| 215 |
+
|
| 216 |
+
# Calculate sentiment distribution
|
| 217 |
+
positive_count = sum(1 for article in articles if article.get('sentiment', {}).get('compound', 0) > 0.1)
|
| 218 |
+
negative_count = sum(1 for article in articles if article.get('sentiment', {}).get('compound', 0) < -0.1)
|
| 219 |
+
neutral_count = len(articles) - positive_count - negative_count
|
| 220 |
+
|
| 221 |
+
# Start building summary
|
| 222 |
+
summary_parts = []
|
| 223 |
+
|
| 224 |
+
# Opening
|
| 225 |
+
summary_parts.append(f"News analysis summary for {len(articles)} articles.")
|
| 226 |
+
|
| 227 |
+
# Sentiment overview
|
| 228 |
+
if positive_count > negative_count:
|
| 229 |
+
summary_parts.append(f"Overall sentiment is predominantly positive, with {positive_count} positive articles, {negative_count} negative, and {neutral_count} neutral.")
|
| 230 |
+
elif negative_count > positive_count:
|
| 231 |
+
summary_parts.append(f"Overall sentiment is predominantly negative, with {negative_count} negative articles, {positive_count} positive, and {neutral_count} neutral.")
|
| 232 |
+
else:
|
                summary_parts.append(f"Sentiment is mixed with balanced coverage across {positive_count} positive, {negative_count} negative, and {neutral_count} neutral articles.")

            # Top stories
            # Most positive story
            positive_articles = sorted(articles, key=lambda x: x.get('sentiment', {}).get('compound', 0), reverse=True)
            if positive_articles and positive_articles[0].get('sentiment', {}).get('compound', 0) > 0.1:
                top_positive = positive_articles[0]
                summary_parts.append(f"Most positive coverage: {top_positive.get('title', '')[:100]}")

            # Most negative story
            negative_articles = sorted(articles, key=lambda x: x.get('sentiment', {}).get('compound', 0))
            if negative_articles and negative_articles[0].get('sentiment', {}).get('compound', 0) < -0.1:
                top_negative = negative_articles[0]
                summary_parts.append(f"Most concerning coverage: {top_negative.get('title', '')[:100]}")

            # Recent developments (if we have dates)
            recent_articles = [a for a in articles if a.get('date')]
            if recent_articles:
                recent_articles.sort(key=lambda x: x.get('date', ''), reverse=True)
                summary_parts.append(f"Latest development: {recent_articles[0].get('title', '')[:100]}")

            # Closing
            summary_parts.append("This concludes the news analysis summary.")

            # Join all parts
            full_summary = " ".join(summary_parts)

            # Keep the narration to a reasonable length for TTS
            if len(full_summary) > 1000:
                # Truncate to the first few sentences
                sentences = full_summary.split('. ')
                return '. '.join(sentences[:8]) + '.'

            return full_summary

        except Exception as e:
            logger.error(f"Audio summary creation failed: {str(e)}")
            return f"Analysis complete for {len(articles)} articles with mixed sentiment coverage."

    def cleanup_cache(self, max_age_hours: int = 24):
        """Clean up old audio files from cache"""
        try:
            if not os.path.exists(self.cache_dir):
                return

            current_time = datetime.now().timestamp()
            max_age_seconds = max_age_hours * 3600

            removed_count = 0

            for filename in os.listdir(self.cache_dir):
                filepath = os.path.join(self.cache_dir, filename)

                if os.path.isfile(filepath):
                    file_age = current_time - os.path.getmtime(filepath)

                    if file_age > max_age_seconds:
                        try:
                            os.remove(filepath)
                            removed_count += 1

                            # Drop matching entries from the in-memory cache as well
                            cache_keys_to_remove = [k for k, v in self.audio_cache.items() if v == filepath]
                            for key in cache_keys_to_remove:
                                del self.audio_cache[key]

                        except Exception as e:
                            logger.error(f"Failed to remove old audio file {filepath}: {str(e)}")

            if removed_count > 0:
                logger.info(f"Cleaned up {removed_count} old audio files")

        except Exception as e:
            logger.error(f"Cache cleanup failed: {str(e)}")

    def get_cache_info(self) -> Dict[str, Any]:
        """Get information about the audio cache"""
        try:
            cache_info = {
                'cache_directory': self.cache_dir,
                'cached_files': len(self.audio_cache),
                'supported_languages': list(self.supported_languages.keys()),
                'gtts_available': GTTS_AVAILABLE
            }

            if os.path.exists(self.cache_dir):
                files = [f for f in os.listdir(self.cache_dir) if f.endswith('.mp3')]
                cache_info['physical_files'] = len(files)

                total_size = sum(os.path.getsize(os.path.join(self.cache_dir, f)) for f in files)
                cache_info['total_size_bytes'] = total_size
                cache_info['total_size_mb'] = round(total_size / (1024 * 1024), 2)

            return cache_info

        except Exception as e:
            logger.error(f"Cache info retrieval failed: {str(e)}")
            return {'error': str(e)}

    def is_language_supported(self, language: str) -> bool:
        """Check if a language is supported for audio generation"""
        return language in self.supported_languages and GTTS_AVAILABLE
utils_module (1).py
ADDED
@@ -0,0 +1,442 @@
import logging
import os
import json
import pickle
import hashlib
from datetime import datetime, timedelta
from typing import Dict, Any, Optional, List
import tempfile
import sys

def setup_logging():
    """Setup logging configuration"""
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.StreamHandler(sys.stdout),
            logging.FileHandler('news_analyzer.log')
        ]
    )

    # Reduce noise from transformers and other libraries
    logging.getLogger("transformers").setLevel(logging.WARNING)
    logging.getLogger("urllib3").setLevel(logging.WARNING)
    logging.getLogger("requests").setLevel(logging.WARNING)

def load_config() -> Dict[str, Any]:
    """Load application configuration"""
    default_config = {
        'max_articles': 50,
        'cache_ttl_hours': 6,
        'supported_languages': ['English', 'Hindi', 'Tamil'],
        'sentiment_models': ['VADER', 'Loughran-McDonald', 'FinBERT'],
        'summarization_max_length': 150,
        'summarization_min_length': 50,
        'audio_enabled': True,
        'translation_enabled': True,
        'keyword_extraction_enabled': True,
        'max_keywords': 20,
        'debug_mode': False
    }

    # Overlay values from config.json if it exists
    config_file = 'config.json'
    if os.path.exists(config_file):
        try:
            with open(config_file, 'r') as f:
                file_config = json.load(f)
            default_config.update(file_config)
        except Exception as e:
            logging.error(f"Failed to load config file: {str(e)}")

    return default_config
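Because load_config() overlays config.json onto the defaults, a partial override file is enough. A hypothetical example:

# config.json contains only: {"max_articles": 25, "debug_mode": true}
config = load_config()
config['max_articles']      # 25, taken from the file
config['cache_ttl_hours']   # 6, default preserved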
class CacheManager:
    """Simple file-based caching system"""

    def __init__(self, cache_dir: str = None):
        self.cache_dir = cache_dir or tempfile.mkdtemp(prefix='news_cache_')
        self.ensure_cache_dir()

        logging.info(f"Cache manager initialized with directory: {self.cache_dir}")

    def ensure_cache_dir(self):
        """Ensure cache directory exists"""
        try:
            os.makedirs(self.cache_dir, exist_ok=True)
        except Exception as e:
            logging.error(f"Failed to create cache directory: {str(e)}")

    def _get_cache_key(self, key: str) -> str:
        """Generate a safe cache key"""
        return hashlib.md5(key.encode()).hexdigest()

    def get(self, key: str, ttl_hours: int = 6) -> Optional[Any]:
        """Get item from cache"""
        try:
            cache_key = self._get_cache_key(key)
            cache_file = os.path.join(self.cache_dir, f"{cache_key}.pkl")

            if not os.path.exists(cache_file):
                return None

            # Treat entries older than the TTL as expired
            file_age = datetime.now().timestamp() - os.path.getmtime(cache_file)
            if file_age > ttl_hours * 3600:
                try:
                    os.remove(cache_file)
                except OSError:
                    pass
                return None

            # Load cached data
            with open(cache_file, 'rb') as f:
                data = pickle.load(f)

            logging.debug(f"Cache hit for key: {key[:50]}...")
            return data

        except Exception as e:
            logging.error(f"Cache get failed for key {key}: {str(e)}")
            return None

    def set(self, key: str, value: Any) -> bool:
        """Set item in cache"""
        try:
            cache_key = self._get_cache_key(key)
            cache_file = os.path.join(self.cache_dir, f"{cache_key}.pkl")

            with open(cache_file, 'wb') as f:
                pickle.dump(value, f)

            logging.debug(f"Cache set for key: {key[:50]}...")
            return True

        except Exception as e:
            logging.error(f"Cache set failed for key {key}: {str(e)}")
            return False

    def clear_expired(self, ttl_hours: int = 24):
        """Clear expired cache entries"""
        try:
            current_time = datetime.now().timestamp()
            max_age = ttl_hours * 3600
            cleared_count = 0

            for filename in os.listdir(self.cache_dir):
                if filename.endswith('.pkl'):
                    filepath = os.path.join(self.cache_dir, filename)
                    file_age = current_time - os.path.getmtime(filepath)

                    if file_age > max_age:
                        try:
                            os.remove(filepath)
                            cleared_count += 1
                        except Exception as e:
                            logging.error(f"Failed to remove cache file {filepath}: {str(e)}")

            if cleared_count > 0:
                logging.info(f"Cleared {cleared_count} expired cache entries")

        except Exception as e:
            logging.error(f"Cache cleanup failed: {str(e)}")
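Direct use of the cache is straightforward; a short sketch (keys are arbitrary strings, hashed internally into MD5 filenames):

cm = CacheManager()
cm.set('articles:tesla', [{'title': 'Example article'}])
hit = cm.get('articles:tesla', ttl_hours=6)   # returns the list, or None once expired
cm.clear_expired(ttl_hours=24)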
+
# Global cache instance
|
| 146 |
+
cache_manager = CacheManager()
|
| 147 |
+
|
| 148 |
+
def cache_results(func):
|
| 149 |
+
"""Decorator for caching function results"""
|
| 150 |
+
def wrapper(*args, **kwargs):
|
| 151 |
+
# Create cache key from function name and arguments
|
| 152 |
+
cache_key = f"{func.__name__}_{str(args)}_{str(kwargs)}"
|
| 153 |
+
|
| 154 |
+
# Try to get from cache
|
| 155 |
+
cached_result = cache_manager.get(cache_key)
|
| 156 |
+
if cached_result is not None:
|
| 157 |
+
return cached_result
|
| 158 |
+
|
| 159 |
+
# Execute function and cache result
|
| 160 |
+
result = func(*args, **kwargs)
|
| 161 |
+
cache_manager.set(cache_key, result)
|
| 162 |
+
|
| 163 |
+
return result
|
| 164 |
+
|
| 165 |
+
return wrapper
|
| 166 |
+
|
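A hypothetical decorated function. Note the cache key is built from str(args), so arguments should have stable string representations:

@cache_results
def fetch_headlines(company: str) -> list:
    # an expensive scrape or API call would go here (illustrative)
    return [f"{company} quarterly results beat estimates"]

fetch_headlines('Tesla')   # computed, then written to the file cache
fetch_headlines('Tesla')   # served from the file cache on the second call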
def validate_input(text: str, min_length: int = 10, max_length: int = 10000) -> bool:
    """Validate input text"""
    if not text or not isinstance(text, str):
        return False

    text = text.strip()
    if len(text) < min_length or len(text) > max_length:
        return False

    return True

def sanitize_filename(filename: str) -> str:
    """Sanitize filename for safe file system usage"""
    import re

    # Replace invalid characters
    sanitized = re.sub(r'[<>:"/\\|?*]', '_', filename)

    # Collapse whitespace and repeated dots
    sanitized = re.sub(r'\s+', '_', sanitized)
    sanitized = re.sub(r'\.+', '.', sanitized)

    # Limit length
    if len(sanitized) > 200:
        sanitized = sanitized[:200]

    return sanitized

def format_datetime(dt: datetime = None) -> str:
    """Format datetime for display"""
    if dt is None:
        dt = datetime.now()

    return dt.strftime("%Y-%m-%d %H:%M:%S")

def calculate_processing_stats(start_time: datetime, num_articles: int) -> Dict[str, Any]:
    """Calculate processing statistics"""
    end_time = datetime.now()
    processing_time = (end_time - start_time).total_seconds()

    return {
        'start_time': format_datetime(start_time),
        'end_time': format_datetime(end_time),
        'processing_time_seconds': processing_time,
        'processing_time_formatted': f"{processing_time:.2f} seconds",
        'articles_processed': num_articles,
        'articles_per_second': round(num_articles / processing_time, 2) if processing_time > 0 else 0
    }
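Timing a batch with calculate_processing_stats (the article count is illustrative):

start = datetime.now()
# ... analyze 40 articles ...
stats = calculate_processing_stats(start, num_articles=40)
print(stats['processing_time_formatted'], stats['articles_per_second'], 'articles/s')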
def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 100) -> List[str]:
    """Split text into overlapping chunks"""
    if len(text) <= chunk_size:
        return [text]

    chunks = []
    start = 0

    while start < len(text):
        end = start + chunk_size

        # If this isn't the last chunk, try to break at a sentence boundary
        if end < len(text):
            # Look for sentence boundaries in the last 100 characters
            last_part = text[end-100:end]
            sentence_end = max(
                last_part.rfind('.'),
                last_part.rfind('!'),
                last_part.rfind('?')
            )

            if sentence_end != -1:
                end = end - 100 + sentence_end + 1

        chunks.append(text[start:end].strip())
        start = end - overlap

    return [chunk for chunk in chunks if chunk.strip()]
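A quick illustration: with the defaults, each chunk ends at the latest sentence boundary found in its final 100 characters, and consecutive chunks share a 100-character overlap:

text = "Markets rallied today. " * 120   # ~2,760 characters
chunks = chunk_text(text, chunk_size=1000, overlap=100)
print(len(chunks), [len(c) for c in chunks])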
+
def extract_domain(url: str) -> str:
|
| 246 |
+
"""Extract domain from URL"""
|
| 247 |
+
try:
|
| 248 |
+
from urllib.parse import urlparse
|
| 249 |
+
parsed = urlparse(url)
|
| 250 |
+
return parsed.netloc.replace('www.', '')
|
| 251 |
+
except Exception:
|
| 252 |
+
return 'unknown'
|
| 253 |
+
|
| 254 |
+
def safe_divide(a: float, b: float, default: float = 0.0) -> float:
|
| 255 |
+
"""Safely divide two numbers"""
|
| 256 |
+
try:
|
| 257 |
+
return a / b if b != 0 else default
|
| 258 |
+
except (TypeError, ZeroDivisionError):
|
| 259 |
+
return default
|
| 260 |
+
|
| 261 |
+
def truncate_text(text: str, max_length: int = 100, suffix: str = "...") -> str:
|
| 262 |
+
"""Truncate text to specified length"""
|
| 263 |
+
if not text or len(text) <= max_length:
|
| 264 |
+
return text
|
| 265 |
+
|
| 266 |
+
return text[:max_length - len(suffix)] + suffix
|
| 267 |
+
|
| 268 |
+
def get_file_size_mb(filepath: str) -> float:
|
| 269 |
+
"""Get file size in MB"""
|
| 270 |
+
try:
|
| 271 |
+
size_bytes = os.path.getsize(filepath)
|
| 272 |
+
return round(size_bytes / (1024 * 1024), 2)
|
| 273 |
+
except Exception:
|
| 274 |
+
return 0.0
|
| 275 |
+
|
| 276 |
+
def ensure_directory(directory: str):
|
| 277 |
+
"""Ensure directory exists"""
|
| 278 |
+
try:
|
| 279 |
+
os.makedirs(directory, exist_ok=True)
|
| 280 |
+
except Exception as e:
|
| 281 |
+
logging.error(f"Failed to create directory {directory}: {str(e)}")
|
| 282 |
+
|
| 283 |
+
def load_json_file(filepath: str) -> Optional[Dict]:
|
| 284 |
+
"""Load JSON file safely"""
|
| 285 |
+
try:
|
| 286 |
+
with open(filepath, 'r', encoding='utf-8') as f:
|
| 287 |
+
return json.load(f)
|
| 288 |
+
except Exception as e:
|
| 289 |
+
logging.error(f"Failed to load JSON file {filepath}: {str(e)}")
|
| 290 |
+
return None
|
| 291 |
+
|
| 292 |
+
def save_json_file(data: Dict, filepath: str) -> bool:
|
| 293 |
+
"""Save data to JSON file safely"""
|
| 294 |
+
try:
|
| 295 |
+
ensure_directory(os.path.dirname(filepath))
|
| 296 |
+
with open(filepath, 'w', encoding='utf-8') as f:
|
| 297 |
+
json.dump(data, f, indent=2, default=str)
|
| 298 |
+
return True
|
| 299 |
+
except Exception as e:
|
| 300 |
+
logging.error(f"Failed to save JSON file {filepath}: {str(e)}")
|
| 301 |
+
return False
|
| 302 |
+
|
| 303 |
+
def merge_dictionaries(*dicts) -> Dict:
|
| 304 |
+
"""Merge multiple dictionaries"""
|
| 305 |
+
result = {}
|
| 306 |
+
for d in dicts:
|
| 307 |
+
if isinstance(d, dict):
|
| 308 |
+
result.update(d)
|
| 309 |
+
return result
|
| 310 |
+
|
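A few of the small helpers in action (the URL and headline are illustrative):

extract_domain('https://www.reuters.com/markets')          # 'reuters.com'
safe_divide(10, 0)                                          # 0.0 instead of raising
truncate_text('A very long headline about earnings', 20)   # 'A very long headl...'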
def get_system_info() -> Dict[str, Any]:
    """Get basic system information"""
    import platform
    import psutil

    try:
        return {
            'platform': platform.platform(),
            'python_version': platform.python_version(),
            'cpu_count': os.cpu_count(),
            'memory_gb': round(psutil.virtual_memory().total / (1024**3), 2),
            'available_memory_gb': round(psutil.virtual_memory().available / (1024**3), 2),
            'disk_space_gb': round(psutil.disk_usage('/').total / (1024**3), 2)
        }
    except Exception as e:
        logging.error(f"Failed to get system info: {str(e)}")
        return {'error': str(e)}

def format_number(num: float, precision: int = 2) -> str:
    """Format number for display"""
    try:
        if abs(num) >= 1_000_000:
            return f"{num / 1_000_000:.{precision}f}M"
        elif abs(num) >= 1_000:
            return f"{num / 1_000:.{precision}f}K"
        else:
            return f"{num:.{precision}f}"
    except Exception:
        return str(num)
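format_number scales values into K/M suffixes for display:

format_number(1_532_000)   # '1.53M'
format_number(4_200)       # '4.20K'
format_number(0.5, 1)      # '0.5'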
def calculate_sentiment_distribution(articles: List[Dict]) -> Dict[str, Any]:
    """Calculate sentiment distribution statistics"""
    try:
        if not articles:
            return {'positive': 0, 'negative': 0, 'neutral': 0, 'total': 0}

        sentiments = []
        for article in articles:
            sentiment = article.get('sentiment', {})
            compound = sentiment.get('compound', 0)
            sentiments.append(compound)

        # Compound scores above 0.1 count as positive, below -0.1 as negative
        positive_count = sum(1 for s in sentiments if s > 0.1)
        negative_count = sum(1 for s in sentiments if s < -0.1)
        neutral_count = len(sentiments) - positive_count - negative_count

        avg_sentiment = sum(sentiments) / len(sentiments) if sentiments else 0

        return {
            'positive': positive_count,
            'negative': negative_count,
            'neutral': neutral_count,
            'total': len(articles),
            'average_sentiment': round(avg_sentiment, 3),
            'positive_percentage': round((positive_count / len(articles)) * 100, 1),
            'negative_percentage': round((negative_count / len(articles)) * 100, 1),
            'neutral_percentage': round((neutral_count / len(articles)) * 100, 1)
        }

    except Exception as e:
        logging.error(f"Sentiment distribution calculation failed: {str(e)}")
        return {'positive': 0, 'negative': 0, 'neutral': 0, 'total': 0}
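One article per bucket under the +/-0.1 compound thresholds:

articles = [
    {'sentiment': {'compound': 0.6}},
    {'sentiment': {'compound': -0.4}},
    {'sentiment': {'compound': 0.05}},
]
dist = calculate_sentiment_distribution(articles)
print(dist['positive'], dist['negative'], dist['neutral'])   # 1 1 1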
def create_progress_callback(progress_container=None):
    """Create a progress callback function for Streamlit"""
    def callback(progress: int, status: str):
        if progress_container:
            try:
                progress_container.progress(progress)
                if hasattr(progress_container, 'text'):
                    progress_container.text(status)
            except Exception as e:
                logging.error(f"Progress callback error: {str(e)}")
        else:
            logging.info(f"Progress: {progress}% - {status}")

    return callback

def validate_url(url: str) -> bool:
    """Validate if string is a valid URL"""
    import re

    url_pattern = re.compile(
        r'^https?://'  # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)  # optional path or query

    return url_pattern.match(url) is not None
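A sketch of what the pattern accepts and rejects (example URLs only):

validate_url('https://news.example.com/article?id=42')   # True
validate_url('http://192.168.0.1:8080/')                 # True
validate_url('not a url')                                # False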
class PerformanceTimer:
    """Context manager for timing operations"""

    def __init__(self, operation_name: str = "Operation"):
        self.operation_name = operation_name
        self.start_time = None
        self.end_time = None

    def __enter__(self):
        self.start_time = datetime.now()
        logging.info(f"Starting {self.operation_name}")
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.end_time = datetime.now()
        duration = (self.end_time - self.start_time).total_seconds()
        logging.info(f"Completed {self.operation_name} in {duration:.2f} seconds")

    @property
    def duration(self) -> float:
        if self.start_time and self.end_time:
            return (self.end_time - self.start_time).total_seconds()
        return 0.0
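Context-manager usage; the duration property remains available after the block exits (scrape_all_sources is a hypothetical workload):

with PerformanceTimer("article scrape") as timer:
    scrape_all_sources()   # hypothetical workload
print(f"took {timer.duration:.2f}s")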
def retry_operation(func, max_attempts: int = 3, delay: float = 1.0):
    """Retry an operation with exponential backoff"""
    import time

    for attempt in range(max_attempts):
        try:
            return func()
        except Exception as e:
            if attempt == max_attempts - 1:
                raise e

            wait_time = delay * (2 ** attempt)
            logging.warning(f"Attempt {attempt + 1} failed: {str(e)}. Retrying in {wait_time} seconds...")
            time.sleep(wait_time)

    return None
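With the defaults, the waits grow as 1s then 2s between the three attempts. A sketch using a lambda (the URL is illustrative; requests is assumed available from requirements_file.txt):

import requests

result = retry_operation(
    lambda: requests.get('https://example.com/api', timeout=5).json(),
    max_attempts=3,
    delay=1.0,
)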