# NOTE(review): the original paste began with "Spaces: / Sleeping / Sleeping" —
# a Hugging Face Spaces page-scrape artifact, not Python code. Preserved here
# as a comment so the module parses.
from typing import List, Tuple
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForTokenClassification,
)
from bs4 import BeautifulSoup
import requests
# ---------------------------------------------------------------------------
# Model identifiers
# ---------------------------------------------------------------------------
SENTIMENT_MODEL_ID = "ahmedrachid/FinancialBERT-Sentiment-Analysis"  # labels: positive / neutral / negative
NER_MODEL_ID = "dslim/bert-base-NER"

# ---------------------------------------------------------------------------
# Eager initialisation of Hugging Face pipelines (shared across requests)
# ---------------------------------------------------------------------------
# Sentiment pipeline; the binary Positive/Negative decision is made later in
# analyze_sentiment(), this model itself is three-class.
sentiment_tokenizer = AutoTokenizer.from_pretrained(SENTIMENT_MODEL_ID)
sentiment_model = AutoModelForSequenceClassification.from_pretrained(SENTIMENT_MODEL_ID)
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model=sentiment_model,
    tokenizer=sentiment_tokenizer,
)

# Named-entity-recognition pipeline (ORG extraction).
ner_tokenizer = AutoTokenizer.from_pretrained(NER_MODEL_ID)
ner_model = AutoModelForTokenClassification.from_pretrained(NER_MODEL_ID)
ner_pipeline = pipeline(
    "ner",
    model=ner_model,
    tokenizer=ner_tokenizer,
    # `grouped_entities=True` is deprecated; `aggregation_strategy="simple"`
    # is its documented replacement and produces the same grouped entities.
    aggregation_strategy="simple",
)
# ---------------------------------------------------------------------------
# Core functionality
# ---------------------------------------------------------------------------
def fetch_news(ticker: str) -> List[dict]:
    """Scrape up to 30 recent headlines from Finviz for *ticker*.

    Returns a list of ``{"title": str, "link": str}`` dictionaries, or an
    empty list on any error or edge case (non-200 response, anti-scraping
    redirect to a placeholder page, missing news table).
    """
    try:
        url = f"https://finviz.com/quote.ashx?t={ticker}"
        # Browser-like headers: Finviz rejects requests with a bare default UA.
        headers = {
            "User-Agent": "Mozilla/5.0",
            "Accept": "text/html",
            "Accept-Language": "en-US,en;q=0.5",
            "Referer": "https://finviz.com/",
            "Connection": "keep-alive",
        }
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code != 200:
            return []
        soup = BeautifulSoup(response.text, "html.parser")
        # Finviz sometimes redirects unknown tickers to a placeholder page
        # whose <title> does not contain the ticker symbol.
        page_title = soup.title.text if soup.title else ""
        if ticker.upper() not in page_title.upper():
            return []
        news_table = soup.find(id="news-table")
        if news_table is None:
            return []
        latest_news: List[dict] = []
        for row in news_table.find_all("tr")[:30]:  # keep only the 30 most recent rows
            link_tag = row.find("a")
            # Guard on .get("href"): an anchor without an href previously
            # raised KeyError, and the broad except below then discarded the
            # ENTIRE headline list instead of skipping the one bad row.
            if link_tag and link_tag.get("href"):
                latest_news.append({
                    "title": link_tag.get_text(strip=True),
                    "link": link_tag["href"],
                })
        return latest_news
    except Exception:
        # Deliberate best-effort: swallow all failures and degrade gracefully.
        return []
# ---------------------------------------------------------------------------
# Sentiment analysis helpers
# ---------------------------------------------------------------------------
# Raw labels produced by the FinancialBERT model.
_POSITIVE = "positive"
_NEGATIVE = "negative"
_DEFAULT_THRESHOLD = 0.55  # default positive-probability threshold; callers may override


def analyze_sentiment(
    text: str,
    pipe=None,
    threshold: float = _DEFAULT_THRESHOLD,
) -> Tuple[str, float]:
    """Classify *text* as ``"Positive"``/``"Negative"`` with its positive probability.

    The underlying model is three-class (positive/neutral/negative); only the
    **positive** score is kept and compared against *threshold* to obtain a
    binary label. Never raises: any internal error yields ``("Unknown", 0.0)``.

    :param text: headline or sentence to classify.
    :param pipe: optional pipeline override (used for testing); defaults to
        the module-level ``sentiment_pipeline``.
    :param threshold: minimum positive probability for a "Positive" label.
    """
    try:
        sentiment_pipe = pipe or sentiment_pipeline
        # NOTE(review): `return_all_scores=True` is deprecated in newer
        # transformers releases in favour of `top_k=None`, which also changes
        # the nesting of the result — confirm the installed version before
        # migrating.
        raw_scores = sentiment_pipe(text, return_all_scores=True, truncation=True)[0]
        score_lookup = {item["label"].lower(): item["score"] for item in raw_scores}
        pos_score = score_lookup.get(_POSITIVE, 0.0)
        label = "Positive" if pos_score >= threshold else "Negative"
        return label, pos_score
    except Exception:
        # Degrade to an explicit sentinel instead of propagating.
        return "Unknown", 0.0


# ---------------------------------------------------------------------------
# Aggregation logic - turning many headlines into one overall label
# ---------------------------------------------------------------------------
def aggregate_sentiments(
    results: List[Tuple[str, float]],
    avg_threshold: float = _DEFAULT_THRESHOLD,
) -> str:
    """Combine per-headline ``(label, positive_score)`` pairs into one label.

    Computes the mean positive probability across all headlines and compares
    it with *avg_threshold*. Returns ``"Unknown"`` for an empty list.
    """
    if not results:
        return "Unknown"
    avg_pos = sum(score for _, score in results) / len(results)
    return "Positive" if avg_pos >= avg_threshold else "Negative"
# ---------------------------------------------------------------------------
# Public helpers (kept for backward compatibility with app.py)
# ---------------------------------------------------------------------------
def get_sentiment_pipeline():
    """Return the module-level (singleton) sentiment pipeline."""
    return sentiment_pipeline
def get_ner_pipeline():
    """Return the module-level (singleton) NER pipeline."""
    return ner_pipeline