# Sentiment & NER helpers for the Stock Analysis Streamlit app.
# (Hugging Face Spaces page chrome / commit-hash gutter removed from this paste.)
from typing import List
from transformers import (
pipeline,
AutoTokenizer,
AutoModelForSequenceClassification,
AutoModelForTokenClassification,
)
from bs4 import BeautifulSoup
import requests
# ---------------------------------------------------------------------------
# Model identifiers – use your custom sentiment model hosted on Hugging Face
# ---------------------------------------------------------------------------
SENTIMENT_MODEL_ID = "LinkLinkWu/Stock_Analysis_Test_Ahamed"  # binary sentiment
NER_MODEL_ID = "dslim/bert-base-NER"  # general-purpose English NER (ORG/PER/LOC/MISC)
# ---------------------------------------------------------------------------
# Eager initialisation (singletons shared by the whole Streamlit session)
# ---------------------------------------------------------------------------
# NOTE: both pipelines are built at import time, so the first import of this
# module downloads/loads the weights; subsequent calls reuse these singletons.
# Sentiment pipeline – returns one label with its score. We will *ignore* the
# numeric score down‑stream to satisfy the "no numbers" requirement.
sentiment_tokenizer = AutoTokenizer.from_pretrained(SENTIMENT_MODEL_ID)
sentiment_model = AutoModelForSequenceClassification.from_pretrained(SENTIMENT_MODEL_ID)
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model=sentiment_model,
    tokenizer=sentiment_tokenizer,
)
# Named‑entity‑recognition pipeline (ORG extraction)
ner_tokenizer = AutoTokenizer.from_pretrained(NER_MODEL_ID)
ner_model = AutoModelForTokenClassification.from_pretrained(NER_MODEL_ID)
ner_pipeline = pipeline(
    "ner",
    model=ner_model,
    tokenizer=ner_tokenizer,
    grouped_entities=True,  # merge word-pieces into whole entity spans
)
# ---------------------------------------------------------------------------
# Web‑scraping helper (Finviz)
# ---------------------------------------------------------------------------
def fetch_news(ticker: str) -> List[dict]:
    """Return at most 30 latest Finviz headlines for *ticker*.

    Each item is a ``{"title": ..., "link": ...}`` dict.  Scraping is
    best-effort by design: any network error, unexpected HTML layout, or a
    redirect page yields an empty list rather than raising.
    """
    try:
        url = f"https://finviz.com/quote.ashx?t={ticker}"
        headers = {
            "User-Agent": "Mozilla/5.0",
            "Accept": "text/html",
            "Accept-Language": "en-US,en;q=0.5",
            "Referer": "https://finviz.com/",
            "Connection": "keep-alive",
        }
        r = requests.get(url, headers=headers, timeout=10)
        if r.status_code != 200:
            return []
        soup = BeautifulSoup(r.text, "html.parser")
        # Finviz serves a generic page for unknown tickers; its <title> then
        # lacks the ticker symbol – treat that as "no news".
        page_title = soup.title.text if soup.title else ""
        if ticker.upper() not in page_title.upper():
            return []  # possibly a redirect page
        table = soup.find(id="news-table")
        if table is None:
            return []
        headlines: List[dict] = []
        for row in table.find_all("tr")[:30]:
            link_tag = row.find("a")
            if link_tag is None:
                continue
            # .get() instead of ["href"]: a single anchor without an href
            # must not abort the whole scrape with a KeyError.
            href = link_tag.get("href")
            if not href:
                continue
            headlines.append({"title": link_tag.get_text(strip=True), "link": href})
        return headlines
    except Exception:
        # Broad catch is deliberate: callers expect [] on any failure.
        return []
# ---------------------------------------------------------------------------
# Sentiment helpers – binary classification, *no* numeric score exposed
# ---------------------------------------------------------------------------
# Mapping for models whose config exposes only generic ids; models with real
# label names ("POSITIVE"/"NEGATIVE"/"NEUTRAL") are handled separately below.
_LABEL_MAP = {"LABEL_0": "Negative", "LABEL_1": "Positive"}  # adjust if model config differs


def analyze_sentiment(text: str, pipe=None) -> str:
    """Return ``"Positive"`` or ``"Negative"`` for a single headline.

    *Neutral* outputs (if ever returned by the model) are coerced to
    *Negative*.  Numeric confidence scores are deliberately discarded to
    honour the "no numbers" requirement.  Returns ``"Unknown"`` if the
    pipeline raises.

    ``pipe`` lets callers/tests inject an alternative pipeline; defaults to
    the module-level singleton.
    """
    try:
        sentiment_pipe = pipe or sentiment_pipeline
        # The deprecated `return_all_scores=False` kwarg is dropped – the
        # pipeline's default already returns a single top label per input.
        result = sentiment_pipe(text, truncation=True)[0]
        raw_label = str(result.get("label", "")).upper()
        if raw_label in _LABEL_MAP:
            return _LABEL_MAP[raw_label]
        # Models with human-readable labels: previously "POSITIVE" fell
        # through the map and was wrongly coerced to "Negative".
        if "POS" in raw_label:
            return "Positive"
        return "Negative"  # Neutral / unrecognised labels default to Negative
    except Exception:
        return "Unknown"
# ---------------------------------------------------------------------------
# Aggregation – majority vote (Positive‑ratio) → binary label
# ---------------------------------------------------------------------------
_POS_RATIO_THRESHOLD = 0.6  # ≥60 % positives → overall Positive


def aggregate_sentiments(labels: List[str], pos_ratio_threshold: float = _POS_RATIO_THRESHOLD) -> str:
    """Collapse per-headline labels into one overall binary sentiment.

    * Empty input                         → ``"Unknown"``.
    * Share of "Positive" ≥ *threshold*   → ``"Positive"``.
    * Otherwise                           → ``"Negative"``.
    """
    if not labels:
        return "Unknown"
    positive_share = labels.count("Positive") / len(labels)
    return "Positive" if positive_share >= pos_ratio_threshold else "Negative"
# ---------------------------------------------------------------------------
# ORG‑entity extraction (ticker discovery)
# ---------------------------------------------------------------------------
def extract_org_entities(text: str, pipe=None, max_entities: int = 5) -> List[str]:
    """Extract up to *max_entities* unique ORG tokens from *text*.

    Tokens are upper-cased and stripped of word-piece markers (``##``);
    insertion order is preserved and duplicates are dropped.  Any pipeline
    failure yields an empty list.  ``pipe`` allows dependency injection for
    tests; defaults to the module-level NER singleton.
    """
    try:
        recognizer = pipe or ner_pipeline
        collected: List[str] = []
        for entity in recognizer(text):
            if entity.get("entity_group") != "ORG":
                continue
            cleaned = entity["word"].replace("##", "").strip().upper()
            if not cleaned or cleaned in collected:
                continue
            collected.append(cleaned)
            if len(collected) >= max_entities:
                break
        return collected
    except Exception:
        return []
# ---------------------------------------------------------------------------
# Public accessors (legacy compatibility)
# ---------------------------------------------------------------------------
def get_sentiment_pipeline():
    """Return the module-level sentiment pipeline singleton (legacy accessor)."""
    return sentiment_pipeline


def get_ner_pipeline():
    """Return the module-level NER pipeline singleton (legacy accessor)."""
    return ner_pipeline