File size: 5,740 Bytes
d25b499
 
 
 
 
 
 
 
7832e21
 
64ffc8f
d25b499
 
 
 
 
 
 
 
 
 
 
 
dd3df57
 
 
d25b499
dd3df57
64ffc8f
d25b499
 
 
dd3df57
 
 
 
d25b499
dd3df57
64ffc8f
d25b499
 
 
 
 
 
 
 
 
 
64ffc8f
 
 
d25b499
 
 
 
 
64ffc8f
d25b499
64ffc8f
 
 
d25b499
 
 
 
64ffc8f
 
d25b499
64ffc8f
 
 
d25b499
 
 
 
 
 
 
 
 
7832e21
d25b499
64ffc8f
 
d25b499
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
628c80f
64ffc8f
628c80f
d25b499
 
 
 
 
7832e21
d25b499
64ffc8f
d25b499
 
 
 
 
 
 
 
 
 
 
 
 
8d9b985
d25b499
 
 
 
 
 
 
 
 
dd3df57
 
d25b499
dd3df57
 
d25b499
dd3df57
d25b499
dd3df57
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
from typing import List, Tuple

from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForTokenClassification,
)
from bs4 import BeautifulSoup
import requests

# ---------------------------------------------------------------------------
# Model identifiers
# ---------------------------------------------------------------------------
SENTIMENT_MODEL_ID = "ahmedrachid/FinancialBERT-Sentiment-Analysis"  # returns: positive / neutral / negative
NER_MODEL_ID = "dslim/bert-base-NER"

# ---------------------------------------------------------------------------
# Eager initialisation of Hugging Face pipelines (shared across requests)
# ---------------------------------------------------------------------------
# NOTE: loading happens once at import time so the (slow) model download and
# warm-up cost is paid per process, not per request.

# Sentiment pipeline. The model is three-class (positive/neutral/negative);
# the binary Positive/Negative decision is made later in analyze_sentiment().
sentiment_tokenizer = AutoTokenizer.from_pretrained(SENTIMENT_MODEL_ID)
sentiment_model = AutoModelForSequenceClassification.from_pretrained(SENTIMENT_MODEL_ID)
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model=sentiment_model,
    tokenizer=sentiment_tokenizer,
)

# Named-entity-recognition pipeline (ORG extraction).
# ``aggregation_strategy="simple"`` is the documented replacement for the
# deprecated ``grouped_entities=True`` flag and merges sub-word tokens into
# whole entities the same way.
ner_tokenizer = AutoTokenizer.from_pretrained(NER_MODEL_ID)
ner_model = AutoModelForTokenClassification.from_pretrained(NER_MODEL_ID)
ner_pipeline = pipeline(
    "ner",
    model=ner_model,
    tokenizer=ner_tokenizer,
    aggregation_strategy="simple",
)

# ---------------------------------------------------------------------------
# Core functionality
# ---------------------------------------------------------------------------

def fetch_news(ticker: str) -> List[dict]:
    """Scrape *up to* 30 recent headlines from Finviz for a given *ticker*.

    Parameters
    ----------
    ticker:
        Stock symbol such as ``"AAPL"``; surrounding whitespace is ignored.

    Returns
    -------
    A list of ``{"title": str, "link": str}`` dictionaries (at most 30), or
    an empty list on any error/edge-case: blank ticker, non-200 response,
    unknown-ticker placeholder page, missing news table, or network failure.
    """
    # Guard early: a blank ticker can never pass the page-title check below,
    # so skip the network round-trip entirely.
    if not ticker or not ticker.strip():
        return []
    ticker = ticker.strip()

    try:
        headers = {
            "User-Agent": "Mozilla/5.0",
            "Accept": "text/html",
            "Accept-Language": "en-US,en;q=0.5",
            "Referer": "https://finviz.com/",
            "Connection": "keep-alive",
        }
        # Pass the ticker via ``params`` so requests URL-encodes it, rather
        # than interpolating it raw into the query string.
        response = requests.get(
            "https://finviz.com/quote.ashx",
            params={"t": ticker},
            headers=headers,
            timeout=10,
        )
        if response.status_code != 200:
            return []

        soup = BeautifulSoup(response.text, "html.parser")
        page_title = soup.title.text if soup.title else ""
        if ticker.upper() not in page_title.upper():
            # Finviz sometimes redirects to a placeholder page if the ticker
            # is unknown.
            return []

        news_table = soup.find(id="news-table")
        if news_table is None:
            return []

        latest_news: List[dict] = []
        for row in news_table.find_all("tr")[:30]:  # keep only the 30 most recent rows
            link_tag = row.find("a")
            if link_tag:
                latest_news.append({
                    "title": link_tag.get_text(strip=True),
                    "link": link_tag["href"],
                })
        return latest_news
    except Exception:
        # Deliberate best-effort: degrade gracefully rather than crash the app.
        return []

# ---------------------------------------------------------------------------
# Sentiment analysis helpers
# ---------------------------------------------------------------------------
# Raw label names emitted by the FinancialBERT model
_POSITIVE = "positive"
_NEGATIVE = "negative"

_DEFAULT_THRESHOLD = 0.55  # default probability threshold; callers may override

def analyze_sentiment(
    text: str,
    pipe=None,
    threshold: float = _DEFAULT_THRESHOLD,
) -> Tuple[str, float]:
    """Binary-classify *text* and return ``(label, positive_probability)``.

    The underlying model is three-class (positive/neutral/negative); only the
    positive probability is retained and compared against *threshold* to pick
    ``"Positive"`` or ``"Negative"``. The function never raises — any internal
    failure produces the fallback ``("Unknown", 0.0)``.

    ``pipe`` may be supplied to override the module-level pipeline; when
    omitted (or falsy), the shared ``sentiment_pipeline`` singleton is used.
    """
    try:
        active_pipe = pipe if pipe else sentiment_pipeline
        all_scores = active_pipe(text, return_all_scores=True, truncation=True)[0]
        # Scan for the positive class; default to 0.0 if the label is absent.
        positive_prob = 0.0
        for entry in all_scores:
            if entry["label"].lower() == _POSITIVE:
                positive_prob = entry["score"]
        verdict = "Positive" if positive_prob >= threshold else "Negative"
        return verdict, positive_prob
    except Exception:
        return "Unknown", 0.0

# ---------------------------------------------------------------------------
# Aggregation logic – turning many headlines into one overall label
# ---------------------------------------------------------------------------

def aggregate_sentiments(
    results: List[Tuple[str, float]],
    avg_threshold: float = _DEFAULT_THRESHOLD,
) -> str:
    """Reduce per-headline ``(label, score)`` pairs to a single overall label.

    The mean positive probability across all headlines is compared with
    *avg_threshold*: at or above it the answer is ``"Positive"``, otherwise
    ``"Negative"``. An empty *results* list yields ``"Unknown"``.
    """
    if not results:
        return "Unknown"

    positive_scores = [score for _, score in results]
    mean_positive = sum(positive_scores) / len(positive_scores)
    return "Positive" if mean_positive >= avg_threshold else "Negative"

# ---------------------------------------------------------------------------
# Public helpers (kept for backward compatibility with app.py)
# ---------------------------------------------------------------------------

def get_sentiment_pipeline():
    """Return the module-level sentiment pipeline singleton.

    The pipeline is created once at import time; this accessor exists so
    callers (e.g. ``app.py``) need not import the module global directly.
    """
    return sentiment_pipeline


def get_ner_pipeline():
    """Return the module-level NER pipeline singleton.

    Same backward-compatibility rationale as ``get_sentiment_pipeline``.
    """
    return ner_pipeline