Sze_Link_ISOM_5240_MODEL

Sleeping

App Files Files Community

Sze_Link_ISOM_5240_MODEL / func.py

LinkLinkWu

Update func.py

d25b499 verified 3 months ago

raw

history blame

5.74 kB

	from typing import List, Tuple

	from transformers import (
	pipeline,
	AutoTokenizer,
	AutoModelForSequenceClassification,
	AutoModelForTokenClassification,
	)
	from bs4 import BeautifulSoup
	import requests

	# ---------------------------------------------------------------------------
	# Model identifiers
	# ---------------------------------------------------------------------------
	# Checkpoint names passed to ``from_pretrained`` when the pipelines are built.
	SENTIMENT_MODEL_ID: str = "ahmedrachid/FinancialBERT-Sentiment-Analysis"  # labels: positive / neutral / negative
	NER_MODEL_ID: str = "dslim/bert-base-NER"  # token-classification checkpoint for the NER pipeline

	# ---------------------------------------------------------------------------
	# Eager initialisation of Hugging Face pipelines (shared across requests)
	# ---------------------------------------------------------------------------
	# Sentiment classifier: tokenizer and weights are loaded once, at import
	# time, and wrapped in a single pipeline object. The model's three-class
	# output is reduced to a binary label later, by the caller.
	sentiment_tokenizer = AutoTokenizer.from_pretrained(SENTIMENT_MODEL_ID)
	sentiment_model = AutoModelForSequenceClassification.from_pretrained(
	    SENTIMENT_MODEL_ID
	)
	sentiment_pipeline = pipeline(
	    task="sentiment-analysis",
	    tokenizer=sentiment_tokenizer,
	    model=sentiment_model,
	)

	# Named-entity-recognition pipeline (ORG extraction). Tokenizer and weights
	# are loaded once, at import time.
	ner_tokenizer = AutoTokenizer.from_pretrained(NER_MODEL_ID)
	ner_model = AutoModelForTokenClassification.from_pretrained(NER_MODEL_ID)
	ner_pipeline = pipeline(
	    "ner",
	    model=ner_model,
	    tokenizer=ner_tokenizer,
	    # ``grouped_entities=True`` is deprecated in transformers; the "simple"
	    # aggregation strategy is its documented replacement and likewise merges
	    # word pieces belonging to the same entity into one result.
	    aggregation_strategy="simple",
	)

	# ---------------------------------------------------------------------------
	# Core functionality
	# ---------------------------------------------------------------------------

	def fetch_news(ticker: str) -> List[dict]:
	    """Scrape up to 30 recent headlines from Finviz for a given ticker.

	    Args:
	        ticker: Stock symbol to look up. Surrounding whitespace is ignored.

	    Returns:
	        A list of ``{"title": str, "link": str}`` dictionaries, or an empty
	        list on any error/edge case: blank or non-string ticker, non-200
	        response, page title that does not mention the ticker, missing news
	        table, or any exception raised along the way. Never raises.
	    """
	    try:
	        symbol = ticker.strip()
	        if not symbol:
	            # An empty string is a substring of every page title, so the
	            # placeholder-page check below could never reject it.
	            return []

	        headers = {
	            "User-Agent": "Mozilla/5.0",
	            "Accept": "text/html",
	            "Accept-Language": "en-US,en;q=0.5",
	            "Referer": "https://finviz.com/",
	            "Connection": "keep-alive",
	        }
	        # Let requests build the query string so the symbol is URL-encoded.
	        response = requests.get(
	            "https://finviz.com/quote.ashx",
	            params={"t": symbol},
	            headers=headers,
	            timeout=10,
	        )
	        if response.status_code != 200:
	            return []

	        soup = BeautifulSoup(response.text, "html.parser")
	        page_title = soup.title.text if soup.title else ""
	        if symbol.upper() not in page_title.upper():
	            # Finviz sometimes redirects to a placeholder page if the ticker is unknown.
	            return []

	        news_table = soup.find(id="news-table")
	        if news_table is None:
	            return []

	        latest_news: List[dict] = []
	        for row in news_table.find_all("tr")[:30]:  # keep only the 30 most recent rows
	            link_tag = row.find("a")
	            # Skip rows without a usable link instead of letting one bad
	            # anchor discard every headline collected so far.
	            href = link_tag.get("href") if link_tag else None
	            if href:
	                latest_news.append({
	                    "title": link_tag.get_text(strip=True),
	                    "link": href,
	                })
	        return latest_news
	    except Exception:
	        # Deliberate best-effort: swallow all exceptions and degrade gracefully.
	        return []

	# ---------------------------------------------------------------------------
	# Sentiment analysis helpers
	# ---------------------------------------------------------------------------
	# Raw labels coming from the FinancialBERT model
	_POSITIVE = "positive"
	_NEGATIVE = "negative"

	_DEFAULT_THRESHOLD = 0.55  # default probability threshold; callers may override


	def analyze_sentiment(
	    text: str,
	    pipe=None,
	    threshold: float = _DEFAULT_THRESHOLD,
	) -> Tuple[str, float]:
	    """Classify text as Positive/Negative and return its positive probability.

	    Only the score of the ``positive`` label is used: it is compared against
	    ``threshold`` to produce a binary label, so any text whose positive score
	    falls below the threshold (including neutral text) is labelled
	    ``"Negative"``.

	    Args:
	        text: The text to classify.
	        pipe: Optional callable used instead of the module-level
	            ``sentiment_pipeline``. It is called as
	            ``pipe(text, return_all_scores=True, truncation=True)``.
	        threshold: Minimum positive score required for ``"Positive"``.

	    Returns:
	        ``(label, positive_score)``. If the pipeline output has no
	        ``positive`` entry the score is ``0.0``. On any internal error the
	        function returns ``("Unknown", 0.0)`` instead of raising.
	    """
	    try:
	        # Compare against None explicitly so a caller-supplied callable that
	        # happens to be falsy is not silently replaced by the default.
	        sentiment_pipe = sentiment_pipeline if pipe is None else pipe
	        output = sentiment_pipe(text, return_all_scores=True, truncation=True)

	        # Accept both output shapes: a nested ``[[{label, score}, ...]]`` and
	        # a flat ``[{label, score}, ...]``.
	        first = output[0]
	        raw_scores = output if isinstance(first, dict) else first

	        score_lookup = {item["label"].lower(): item["score"] for item in raw_scores}
	        pos_score = score_lookup.get(_POSITIVE, 0.0)
	        label = "Positive" if pos_score >= threshold else "Negative"
	        return label, pos_score
	    except Exception:
	        return "Unknown", 0.0

	# ---------------------------------------------------------------------------
	# Aggregation logic – turning many headlines into one overall label
	# ---------------------------------------------------------------------------

	def aggregate_sentiments(
	    results: List[Tuple[str, float]],
	    avg_threshold: float = _DEFAULT_THRESHOLD,
	) -> str:
	    """Combine individual headline results into a single overall label.

	    The rule is simple: compute the mean positive probability across the
	    headlines and compare it with ``avg_threshold``.

	    Entries labelled ``"Unknown"`` are left out of the mean. That label is
	    the failure fallback of ``analyze_sentiment`` and its ``0.0`` score is a
	    placeholder, not a measured probability, so averaging it in would bias
	    the result toward ``"Negative"``.

	    Args:
	        results: ``(label, positive_score)`` pairs, one per headline.
	        avg_threshold: Minimum mean positive score for ``"Positive"``.

	    Returns:
	        ``"Positive"`` or ``"Negative"``, or ``"Unknown"`` when ``results``
	        is empty or contains only ``"Unknown"`` entries.
	    """
	    scores = [score for label, score in results if label != "Unknown"]
	    if not scores:
	        return "Unknown"

	    avg_pos = sum(scores) / len(scores)
	    return "Positive" if avg_pos >= avg_threshold else "Negative"

	# ---------------------------------------------------------------------------
	# Public helpers (kept for backward compatibility with app.py)
	# ---------------------------------------------------------------------------

	def get_sentiment_pipeline():
	    """Return the module-level sentiment pipeline.

	    Nothing is constructed here: every call hands back the same
	    ``sentiment_pipeline`` object.
	    """
	    shared_pipeline = sentiment_pipeline
	    return shared_pipeline


	def get_ner_pipeline():
	    """Return the module-level NER pipeline.

	    Nothing is constructed here: every call hands back the same
	    ``ner_pipeline`` object.
	    """
	    shared_pipeline = ner_pipeline
	    return shared_pipeline