Sze_Link_ISOM_5240_MODEL

Sleeping

App Files Files Community

Sze_Link_ISOM_5240_MODEL / func.py

LinkLinkWu

Update func.py

64d5a00 verified 3 months ago

raw

history blame

5.86 kB

	from typing import List

	from transformers import (
	pipeline,
	AutoTokenizer,
	AutoModelForSequenceClassification,
	AutoModelForTokenClassification,
	)
	from bs4 import BeautifulSoup
	import requests

	# ---------------------------------------------------------------------------
	# Model identifiers – use your custom sentiment model hosted on Hugging Face
	# ---------------------------------------------------------------------------
	SENTIMENT_MODEL_ID = "LinkLinkWu/Stock_Analysis_Test_Ahamed"  # binary sentiment
	NER_MODEL_ID = "dslim/bert-base-NER"

	# ---------------------------------------------------------------------------
	# Eager initialisation (singletons shared by the whole Streamlit session)
	# ---------------------------------------------------------------------------
	# Both pipelines are constructed at import time, so importing this module
	# triggers the tokenizer/model loading for the two ids above.
	#
	# Sentiment pipeline – returns one label with its score. We will ignore the
	# numeric score down‑stream to satisfy the "no numbers" requirement.
	sentiment_tokenizer = AutoTokenizer.from_pretrained(SENTIMENT_MODEL_ID)
	sentiment_model = AutoModelForSequenceClassification.from_pretrained(SENTIMENT_MODEL_ID)
	sentiment_pipeline = pipeline(
	    "sentiment-analysis",
	    model=sentiment_model,
	    tokenizer=sentiment_tokenizer,
	)

	# Named‑entity‑recognition pipeline (ORG extraction).
	# ``aggregation_strategy="simple"`` is the documented replacement for the
	# deprecated ``grouped_entities=True`` flag: adjacent tokens of the same
	# entity are merged and reported under the ``"entity_group"`` key.
	ner_tokenizer = AutoTokenizer.from_pretrained(NER_MODEL_ID)
	ner_model = AutoModelForTokenClassification.from_pretrained(NER_MODEL_ID)
	ner_pipeline = pipeline(
	    "ner",
	    model=ner_model,
	    tokenizer=ner_tokenizer,
	    aggregation_strategy="simple",
	)

	# ---------------------------------------------------------------------------
	# Web‑scraping helper (Finviz)
	# ---------------------------------------------------------------------------

	def fetch_news(ticker: str) -> List[dict]:
	    """Return at most 30 of the latest Finviz headlines for *ticker*.

	    Each headline is a dict with two keys: ``"title"`` (the anchor text) and
	    ``"link"`` (the anchor's ``href``, resolved to an absolute URL).

	    The function is best-effort and never raises. It returns an empty list
	    when the ticker is blank, the HTTP status is not 200, the page title does
	    not mention the ticker, the news table is missing, or any exception
	    occurs while fetching or parsing.
	    """
	    from urllib.parse import urljoin

	    quote_url = "https://finviz.com/quote.ashx"
	    try:
	        symbol = ticker.strip()
	        if not symbol:
	            # An empty ticker would pass the title check below vacuously
	            # ("" is a substring of every title), so reject it up front.
	            return []

	        headers = {
	            "User-Agent": "Mozilla/5.0",
	            "Accept": "text/html",
	            "Accept-Language": "en-US,en;q=0.5",
	            "Referer": "https://finviz.com/",
	            "Connection": "keep-alive",
	        }
	        # Let requests build the query string so the ticker is URL-encoded.
	        r = requests.get(quote_url, params={"t": symbol}, headers=headers, timeout=10)
	        if r.status_code != 200:
	            return []

	        soup = BeautifulSoup(r.text, "html.parser")
	        page_title = soup.title.text if soup.title else ""
	        if symbol.upper() not in page_title.upper():
	            return []  # possibly a redirect page

	        table = soup.find(id="news-table")
	        if table is None:
	            return []

	        headlines: List[dict] = []
	        # Only the first 30 table rows are inspected; rows without a usable
	        # link are skipped, so fewer than 30 headlines may be returned.
	        for row in table.find_all("tr")[:30]:
	            link_tag = row.find("a")
	            href = link_tag.get("href") if link_tag else None
	            if not href:
	                # Skip this row instead of failing the whole scrape.
	                continue
	            headlines.append(
	                {
	                    "title": link_tag.get_text(strip=True),
	                    # Absolute hrefs pass through unchanged; relative ones
	                    # are resolved against the quote page URL.
	                    "link": urljoin(quote_url, href),
	                }
	            )
	        return headlines
	    except Exception:
	        # Deliberate best-effort: any network/parsing failure means "no news".
	        return []

	# ---------------------------------------------------------------------------
	# Sentiment helpers – binary classification, no numeric score exposed
	# ---------------------------------------------------------------------------
	# Maps the raw (upper‑cased) model label to the label exposed to callers.
	# Both the generic ``LABEL_n`` ids and the human‑readable names are accepted,
	# so a model whose config defines readable labels is not silently coerced
	# to "Negative". Adjust if the model config differs.
	_LABEL_MAP = {
	    "LABEL_0": "Negative",
	    "LABEL_1": "Positive",
	    "NEGATIVE": "Negative",
	    "POSITIVE": "Positive",
	}


	def analyze_sentiment(text: str, pipe=None) -> str:
	    """Classify a single headline as "Positive" or "Negative".

	    Args:
	        text: Headline to classify.
	        pipe: Optional callable used instead of the module‑level
	            ``sentiment_pipeline``. It is called as
	            ``pipe(text, truncation=True)`` and must return a sequence whose
	            first item is a dict with a ``"label"`` key (or a list of such
	            dicts, in which case the highest ``"score"`` wins).

	    Returns:
	        "Positive" or "Negative". Any label not present in ``_LABEL_MAP``
	        (e.g. a neutral class) is coerced to "Negative". If the pipeline
	        call or result handling raises, "Unknown" is returned instead.

	    Numeric confidence scores are deliberately discarded to honour the
	    "no numbers" requirement.
	    """
	    try:
	        sentiment_pipe = pipe if pipe is not None else sentiment_pipeline
	        # The deprecated ``return_all_scores`` kwarg is no longer passed; the
	        # default call already yields the single best label per input.
	        top = sentiment_pipe(text, truncation=True)[0]
	        if isinstance(top, (list, tuple)):
	            # All-scores shape (one dict per class): keep the best class.
	            top = max(top, key=lambda item: item.get("score", 0.0))
	        raw_label = top.get("label", "").upper()
	        return _LABEL_MAP.get(raw_label, "Negative")  # default to Negative
	    except Exception:
	        return "Unknown"

	# ---------------------------------------------------------------------------
	# Aggregation – Positive‑ratio threshold (not a simple majority) → binary label
	# ---------------------------------------------------------------------------

	_POS_RATIO_THRESHOLD = 0.6  # ≥60 % positives → overall Positive


	def aggregate_sentiments(labels: List[str], pos_ratio_threshold: float = _POS_RATIO_THRESHOLD) -> str:
	    """Reduce per‑headline labels to one overall binary sentiment.

	    Args:
	        labels: Individual headline labels.
	        pos_ratio_threshold: Minimum share of "Positive" labels required for
	            an overall "Positive" verdict.

	    Returns:
	        "Unknown" when ``labels`` is empty; "Positive" when the share of
	        entries equal to "Positive" reaches the threshold; "Negative"
	        otherwise. Every entry counts toward the total, so labels other
	        than "Positive" (including "Unknown") lower the ratio.
	    """
	    # Guard clause: nothing to aggregate.
	    if not labels:
	        return "Unknown"

	    positive_votes = [label for label in labels if label == "Positive"]
	    positive_share = len(positive_votes) / len(labels)

	    if positive_share >= pos_ratio_threshold:
	        return "Positive"
	    return "Negative"

	# ---------------------------------------------------------------------------
	# ORG‑entity extraction (ticker discovery)
	# ---------------------------------------------------------------------------

	def extract_org_entities(text: str, pipe=None, max_entities: int = 5) -> List[str]:
	    """Extract up to ``max_entities`` unique ORG names from ``text``.

	    Args:
	        text: Text to run entity recognition on.
	        pipe: Optional callable used instead of the module‑level
	            ``ner_pipeline``. It is called as ``pipe(text)`` and must return
	            an iterable of dicts with ``"entity_group"`` and ``"word"`` keys.
	        max_entities: Maximum number of names to return; a value of zero or
	            less yields an empty list.

	    Returns:
	        ORG names in first‑seen order, upper‑cased, stripped, with "##"
	        word‑piece markers removed and duplicates dropped. An empty list is
	        returned if the pipeline call or result handling raises.
	    """
	    try:
	        if max_entities <= 0:
	            # The limit is otherwise only checked after an append, which
	            # would let one entity through.
	            return []

	        ner_pipe = pipe if pipe is not None else ner_pipeline
	        orgs: List[str] = []
	        for ent in ner_pipe(text):
	            if ent.get("entity_group") != "ORG":
	                continue
	            # A missing/empty "word" skips this entity rather than
	            # discarding everything collected so far.
	            token = (ent.get("word") or "").replace("##", "").strip().upper()
	            if not token or token in orgs:
	                continue
	            orgs.append(token)
	            if len(orgs) >= max_entities:
	                break
	        return orgs
	    except Exception:
	        return []

	# ---------------------------------------------------------------------------
	# Public accessors (legacy compatibility)
	# ---------------------------------------------------------------------------

	def get_sentiment_pipeline():
	    """Return the module‑level ``sentiment_pipeline`` object (legacy accessor)."""
	    return sentiment_pipeline


	def get_ner_pipeline():
	    """Return the module‑level ``ner_pipeline`` object (legacy accessor)."""
	    return ner_pipeline