|
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForTokenClassification |
|
from bs4 import BeautifulSoup |
|
import requests |
|
|
|
|
|
|
|
model_id = "LinkLinkWu/ISOM5240HKUSTBASE" |
|
sentiment_tokenizer = AutoTokenizer.from_pretrained(model_id) |
|
sentiment_model = AutoModelForSequenceClassification.from_pretrained(model_id) |
|
sentiment_pipeline = pipeline( |
|
"sentiment-analysis", |
|
model=sentiment_model, |
|
tokenizer=sentiment_tokenizer |
|
) |
|
|
|
|
|
ner_tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER") |
|
ner_model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER") |
|
ner_pipeline = pipeline( |
|
"ner", |
|
model=ner_model, |
|
tokenizer=ner_tokenizer, |
|
grouped_entities=True |
|
) |
|
|
|
|
|
def fetch_news(ticker): |
|
try: |
|
url = f"https://finviz.com/quote.ashx?t={ticker}" |
|
headers = { |
|
'User-Agent': 'Mozilla/5.0', |
|
'Accept': 'text/html', |
|
'Accept-Language': 'en-US,en;q=0.5', |
|
'Referer': 'https://finviz.com/', |
|
'Connection': 'keep-alive', |
|
} |
|
response = requests.get(url, headers=headers) |
|
if response.status_code != 200: |
|
return [] |
|
|
|
soup = BeautifulSoup(response.text, 'html.parser') |
|
title = soup.title.text if soup.title else "" |
|
if ticker not in title: |
|
return [] |
|
|
|
news_table = soup.find(id='news-table') |
|
if news_table is None: |
|
return [] |
|
|
|
news = [] |
|
for row in news_table.findAll('tr')[:30]: |
|
a_tag = row.find('a') |
|
if a_tag: |
|
title_text = a_tag.get_text() |
|
link = a_tag['href'] |
|
news.append({'title': title_text, 'link': link}) |
|
return news |
|
except Exception: |
|
return [] |
|
|
|
def analyze_sentiment(text, pipe=None): |
|
""" |
|
兼容两种调用: |
|
- analyze_sentiment(text) -> 使用全局 sentiment_pipeline |
|
- analyze_sentiment(text, some_pipeline) -> 使用传入的 some_pipeline |
|
""" |
|
try: |
|
sentiment_pipe = pipe or sentiment_pipeline |
|
result = sentiment_pipe(text)[0] |
|
return "Positive" if result['label'] == 'POSITIVE' else "Negative" |
|
except Exception: |
|
return "Unknown" |
|
|
|
def extract_org_entities(text, pipe=None): |
|
""" |
|
- extract_org_entities(text) |
|
- extract_org_entities(text, some_pipeline) |
|
""" |
|
try: |
|
ner_pipe = pipe or ner_pipeline |
|
entities = ner_pipe(text) |
|
orgs = [] |
|
for ent in entities: |
|
if ent["entity_group"] == "ORG": |
|
w = ent["word"].replace("##", "").strip().upper() |
|
if w not in orgs: |
|
orgs.append(w) |
|
if len(orgs) >= 5: |
|
break |
|
return orgs |
|
except Exception: |
|
return [] |
|
|
|
|
|
def get_sentiment_pipeline(): |
|
return sentiment_pipeline |
|
|
|
def get_ner_pipeline(): |
|
return ner_pipeline |
|
|