File size: 5,740 Bytes
d25b499
 
 
 
 
 
 
 
7832e21
 
64ffc8f
d25b499
 
 
 
 
 
 
 
 
 
 
 
dd3df57
 
 
d25b499
dd3df57
64ffc8f
d25b499
 
 
dd3df57
 
 
 
d25b499
dd3df57
64ffc8f
d25b499
 
 
 
 
 
 
 
 
 
64ffc8f
 
 
d25b499
 
 
 
 
64ffc8f
d25b499
64ffc8f
 
 
d25b499
 
 
 
64ffc8f
 
d25b499
64ffc8f
 
 
d25b499
 
 
 
 
 
 
 
 
7832e21
d25b499
64ffc8f
 
d25b499
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
628c80f
64ffc8f
628c80f
d25b499
 
 
 
 
7832e21
d25b499
64ffc8f
d25b499
 
 
 
 
 
 
 
 
 
 
 
 
8d9b985
d25b499
 
 
 
 
 
 
 
 
dd3df57
 
d25b499
dd3df57
 
d25b499
dd3df57
d25b499
dd3df57
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
from typing import List, Tuple

from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForTokenClassification,
)
from bs4 import BeautifulSoup
import requests

# ---------------------------------------------------------------------------
# Model identifiers
# ---------------------------------------------------------------------------
SENTIMENT_MODEL_ID = "ahmedrachid/FinancialBERT-Sentiment-Analysis"  # returns: positive / neutral / negative
NER_MODEL_ID = "dslim/bert-base-NER"

# ---------------------------------------------------------------------------
# Eager initialisation of Hugging Face pipelines (shared across requests)
# ---------------------------------------------------------------------------
# NOTE: loading happens once at import time so the (slow) model download and
# warm-up cost is paid per process, not per request.

# Sentiment pipeline. The model is three-class (positive/neutral/negative);
# the binary Positive/Negative decision is made later in analyze_sentiment().
sentiment_tokenizer = AutoTokenizer.from_pretrained(SENTIMENT_MODEL_ID)
sentiment_model = AutoModelForSequenceClassification.from_pretrained(SENTIMENT_MODEL_ID)
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model=sentiment_model,
    tokenizer=sentiment_tokenizer,
)

# Named-entity-recognition pipeline (ORG extraction).
# ``aggregation_strategy="simple"`` is the documented replacement for the
# deprecated ``grouped_entities=True`` flag and merges sub-word tokens into
# whole entities the same way.
ner_tokenizer = AutoTokenizer.from_pretrained(NER_MODEL_ID)
ner_model = AutoModelForTokenClassification.from_pretrained(NER_MODEL_ID)
ner_pipeline = pipeline(
    "ner",
    model=ner_model,
    tokenizer=ner_tokenizer,
    aggregation_strategy="simple",
)

# ---------------------------------------------------------------------------
# Core functionality
# ---------------------------------------------------------------------------

def fetch_news(ticker: str) -> List[dict]:
    """Scrape *up to* 30 recent headlines from Finviz for a given *ticker*.

    Parameters
    ----------
    ticker:
        Stock symbol such as ``"AAPL"``; surrounding whitespace is ignored.

    Returns
    -------
    A list of ``{"title": str, "link": str}`` dictionaries (at most 30), or
    an empty list on any error/edge-case: blank ticker, non-200 response,
    unknown-ticker placeholder page, missing news table, or network failure.
    """
    # Guard early: a blank ticker can never pass the page-title check below,
    # so skip the network round-trip entirely.
    if not ticker or not ticker.strip():
        return []
    ticker = ticker.strip()

    try:
        headers = {
            "User-Agent": "Mozilla/5.0",
            "Accept": "text/html",
            "Accept-Language": "en-US,en;q=0.5",
            "Referer": "https://finviz.com/",
            "Connection": "keep-alive",
        }
        # Pass the ticker via ``params`` so requests URL-encodes it, rather
        # than interpolating it raw into the query string.
        response = requests.get(
            "https://finviz.com/quote.ashx",
            params={"t": ticker},
            headers=headers,
            timeout=10,
        )
        if response.status_code != 200:
            return []

        soup = BeautifulSoup(response.text, "html.parser")
        page_title = soup.title.text if soup.title else ""
        if ticker.upper() not in page_title.upper():
            # Finviz sometimes redirects to a placeholder page if the ticker
            # is unknown.
            return []

        news_table = soup.find(id="news-table")
        if news_table is None:
            return []

        latest_news: List[dict] = []
        for row in news_table.find_all("tr")[:30]:  # keep only the 30 most recent rows
            link_tag = row.find("a")
            if link_tag:
                latest_news.append({
                    "title": link_tag.get_text(strip=True),
                    "link": link_tag["href"],
                })
        return latest_news
    except Exception:
        # Deliberate best-effort: degrade gracefully rather than crash the app.
        return []

# ---------------------------------------------------------------------------
# Sentiment analysis helpers
# ---------------------------------------------------------------------------
# Raw label names emitted by the FinancialBERT model
_POSITIVE = "positive"
_NEGATIVE = "negative"

_DEFAULT_THRESHOLD = 0.55  # default probability threshold; callers may override

def analyze_sentiment(
    text: str,
    pipe=None,
    threshold: float = _DEFAULT_THRESHOLD,
) -> Tuple[str, float]:
    """Binary-classify *text* and return ``(label, positive_probability)``.

    The underlying model is three-class (positive/neutral/negative); only the
    positive probability is retained and compared against *threshold* to pick
    ``"Positive"`` or ``"Negative"``. The function never raises — any internal
    failure produces the fallback ``("Unknown", 0.0)``.

    ``pipe`` may be supplied to override the module-level pipeline; when
    omitted (or falsy), the shared ``sentiment_pipeline`` singleton is used.
    """
    try:
        active_pipe = pipe if pipe else sentiment_pipeline
        all_scores = active_pipe(text, return_all_scores=True, truncation=True)[0]
        # Scan for the positive class; default to 0.0 if the label is absent.
        positive_prob = 0.0
        for entry in all_scores:
            if entry["label"].lower() == _POSITIVE:
                positive_prob = entry["score"]
        verdict = "Positive" if positive_prob >= threshold else "Negative"
        return verdict, positive_prob
    except Exception:
        return "Unknown", 0.0

# ---------------------------------------------------------------------------
# Aggregation logic – turning many headlines into one overall label
# ---------------------------------------------------------------------------

def aggregate_sentiments(
    results: List[Tuple[str, float]],
    avg_threshold: float = _DEFAULT_THRESHOLD,
) -> str:
    """Reduce per-headline ``(label, score)`` pairs to a single overall label.

    The mean positive probability across all headlines is compared with
    *avg_threshold*: at or above it the answer is ``"Positive"``, otherwise
    ``"Negative"``. An empty *results* list yields ``"Unknown"``.
    """
    if not results:
        return "Unknown"

    positive_scores = [score for _, score in results]
    mean_positive = sum(positive_scores) / len(positive_scores)
    return "Positive" if mean_positive >= avg_threshold else "Negative"

# ---------------------------------------------------------------------------
# Public helpers (kept for backward compatibility with app.py)
# ---------------------------------------------------------------------------

def get_sentiment_pipeline():
    """Return the module-level sentiment pipeline singleton.

    The pipeline is created once at import time; this accessor exists so
    callers (e.g. ``app.py``) need not import the module global directly.
    """
    return sentiment_pipeline


def get_ner_pipeline():
    """Return the module-level NER pipeline singleton.

    Same backward-compatibility rationale as ``get_sentiment_pipeline``.
    """
    return ner_pipeline