from typing import List

from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForTokenClassification,
)
from bs4 import BeautifulSoup
import requests

# ---------------------------------------------------------------------------
# Model identifiers – use your custom sentiment model hosted on Hugging Face
# ---------------------------------------------------------------------------
SENTIMENT_MODEL_ID = "LinkLinkWu/Stock_Analysis_Test_Ahamed"  # binary sentiment
NER_MODEL_ID = "dslim/bert-base-NER"

# ---------------------------------------------------------------------------
# Eager initialisation (singletons shared by the whole Streamlit session)
# ---------------------------------------------------------------------------
# Sentiment pipeline – returns one label with its score. We will *ignore* the
# numeric score downstream to satisfy the "no numbers" requirement.
sentiment_tokenizer = AutoTokenizer.from_pretrained(SENTIMENT_MODEL_ID)
sentiment_model = AutoModelForSequenceClassification.from_pretrained(SENTIMENT_MODEL_ID)
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model=sentiment_model,
    tokenizer=sentiment_tokenizer,
)

# Named‑entity‑recognition pipeline (ORG extraction)
ner_tokenizer = AutoTokenizer.from_pretrained(NER_MODEL_ID)
ner_model = AutoModelForTokenClassification.from_pretrained(NER_MODEL_ID)
ner_pipeline = pipeline(
    "ner",
    model=ner_model,
    tokenizer=ner_tokenizer,
    aggregation_strategy="simple",  # successor to the deprecated grouped_entities=True
)

# ---------------------------------------------------------------------------
# Web‑scraping helper (Finviz)
# ---------------------------------------------------------------------------

def fetch_news(ticker: str) -> List[dict]:
    """Return at most 30 latest Finviz headlines for *ticker* ("title" & "link")."""
    try:
        url = f"https://finviz.com/quote.ashx?t={ticker}"
        headers = {
            "User-Agent": "Mozilla/5.0",
            "Accept": "text/html",
            "Accept-Language": "en-US,en;q=0.5",
            "Referer": "https://finviz.com/",
            "Connection": "keep-alive",
        }
        r = requests.get(url, headers=headers, timeout=10)
        if r.status_code != 200:
            return []

        soup = BeautifulSoup(r.text, "html.parser")
        if ticker.upper() not in (soup.title.text if soup.title else "").upper():
            return []  # possibly a redirect page

        table = soup.find(id="news-table")
        if table is None:
            return []

        headlines: List[dict] = []
        for row in table.find_all("tr")[:30]:
            link_tag = row.find("a")
            if link_tag:
                headlines.append({"title": link_tag.get_text(strip=True), "link": link_tag["href"]})
        return headlines
    except Exception:
        return []
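
# Illustrative usage (requires network access; "AAPL" is only a sample ticker):
#
#     news = fetch_news("AAPL")
#     for item in news[:3]:
#         print(item["title"], "->", item["link"])
#
# An empty list signals any failure mode (HTTP error, redirect page, missing table).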

# ---------------------------------------------------------------------------
# Sentiment helpers – binary classification, *no* numeric score exposed
# ---------------------------------------------------------------------------
_LABEL_MAP = {"LABEL_0": "Negative", "LABEL_1": "Positive"}  # adjust if model config differs


def analyze_sentiment(text: str, pipe=None) -> str:
    """Return **"Positive"** or **"Negative"** for a single headline.

    *Neutral* outputs (if ever returned by the model) are coerced to *Negative*.
    Numeric confidence scores are deliberately discarded to honour the
    "no numbers" requirement.
    """
    try:
        sentiment_pipe = pipe or sentiment_pipeline
        result = sentiment_pipe(text, truncation=True)[0]  # top label only; return_all_scores is deprecated
        raw_label = result.get("label", "").upper()
        label = _LABEL_MAP.get(raw_label, "Negative")  # default to Negative
        return label
    except Exception:
        return "Unknown"

# ---------------------------------------------------------------------------
# Aggregation – majority vote (Positive‑ratio) → binary label
# ---------------------------------------------------------------------------

_POS_RATIO_THRESHOLD = 0.6  # ≥60 % positives → overall Positive


def aggregate_sentiments(labels: List[str], pos_ratio_threshold: float = _POS_RATIO_THRESHOLD) -> str:
    """Combine individual headline labels into an overall binary sentiment.

    * If *Positive* proportion ≥ *pos_ratio_threshold* → *Positive*.
    * Otherwise → *Negative*.
    * Empty list → *Unknown*.
    """
    if not labels:
        return "Unknown"

    total = len(labels)
    positives = sum(1 for label in labels if label == "Positive")
    ratio = positives / total
    return "Positive" if ratio >= pos_ratio_threshold else "Negative"

# ---------------------------------------------------------------------------
# ORG‑entity extraction (ticker discovery)
# ---------------------------------------------------------------------------

def extract_org_entities(text: str, pipe=None, max_entities: int = 5) -> List[str]:
    """Extract up to *max_entities* unique ORG tokens (upper‑case, de‑hashed)."""
    try:
        ner_pipe = pipe or ner_pipeline
        entities = ner_pipe(text)
        orgs: List[str] = []
        for ent in entities:
            if ent.get("entity_group") == "ORG":
                token = ent["word"].replace("##", "").strip().upper()
                if token and token not in orgs:
                    orgs.append(token)
                if len(orgs) >= max_entities:
                    break
        return orgs
    except Exception:
        return []
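
# Illustrative call (the entities found depend on the NER model's predictions):
#
#     extract_org_entities("Apple and Microsoft rallied while Tesla slipped")
#     # -> e.g. ["APPLE", "MICROSOFT", "TESLA"]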

# ---------------------------------------------------------------------------
# Public accessors (legacy compatibility)
# ---------------------------------------------------------------------------

def get_sentiment_pipeline():
    return sentiment_pipeline


def get_ner_pipeline():
    return ner_pipeline
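

# ---------------------------------------------------------------------------
# Minimal end-to-end sketch – not part of the public surface; "AAPL" is only a
# sample ticker, and real output depends on live Finviz data and the model.
# ---------------------------------------------------------------------------

if __name__ == "__main__":
    ticker = "AAPL"
    news = fetch_news(ticker)
    labels = [analyze_sentiment(item["title"]) for item in news]
    print(f"{ticker}: {aggregate_sentiments(labels)} ({len(labels)} headlines analysed)")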