Szeyu committed on
Commit 78c1fff · verified · 1 Parent(s): e7f6be9

Update app.py

Files changed (1)
  1. app.py +54 -153
app.py CHANGED
@@ -1,161 +1,62 @@
  import streamlit as st
- import pandas as pd
- from sklearn.model_selection import train_test_split
- from datasets import Dataset
- from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, get_linear_schedule_with_warmup
- import numpy as np
- import torch
  from transformers import pipeline
- from collections import Counter
- import time
- from tqdm import tqdm
- import evaluate
 
- # Function to load and process data
- def load_and_process_data(news_file, trend_file):
-     news_df = pd.read_csv(news_file)
-     trend_df = pd.read_csv(trend_file)
-     trend_df = trend_df.rename(columns={'Symbol': 'Stock'})
-     news_labeled_df = news_df.merge(trend_df[['Stock', 'Trend']], on='Stock', how='left')
-     news_labeled_df = news_labeled_df[news_labeled_df['Trend'].isin(['Positive', 'Negative'])]
-     label_map = {'Negative': 0, 'Positive': 1}
-     news_labeled_df['label'] = news_labeled_df['Trend'].map(label_map)
-     return news_labeled_df
-
- # Function to check class imbalance
- def check_class_imbalance(df):
-     class_counts = df['label'].value_counts()
-     st.write("**Class Distribution:**", class_counts.to_dict())
-     if class_counts.min() / class_counts.max() < 0.5:
-         st.warning("Warning: Class imbalance detected. Consider balancing techniques.")
-
- # Function to split data
- def split_data(df):
-     stocks = df['Stock'].unique()
-     train_val_stocks, test_stocks = train_test_split(stocks, test_size=0.2, random_state=42)
-     train_stocks, val_stocks = train_test_split(train_val_stocks, test_size=0.25, random_state=42)
-     train_df = df[df['Stock'].isin(train_stocks)]
-     val_df = df[df['Stock'].isin(val_stocks)]
-     test_df = df[df['Stock'].isin(test_stocks)]
-     return train_df, val_df, test_df
-
- # Function to tokenize datasets
- def tokenize_datasets(train_df, val_df, test_df, tokenizer):
-     train_dataset = Dataset.from_pandas(train_df[['Headline', 'label']])
-     val_dataset = Dataset.from_pandas(val_df[['Headline', 'label']])
-     test_dataset = Dataset.from_pandas(test_df[['Headline', 'label']])
-     def tokenize_function(examples):
-         return tokenizer(examples['Headline'], padding='max_length', truncation=True, max_length=128)
-     tokenized_train = train_dataset.map(tokenize_function, batched=True)
-     tokenized_val = val_dataset.map(tokenize_function, batched=True)
-     tokenized_test = test_dataset.map(tokenize_function, batched=True)
-     return tokenized_train, tokenized_val, tokenized_test
-
- # Function to load model with caching
- @st.cache_resource
- def load_model():
-     model = AutoModelForSequenceClassification.from_pretrained(
-         "yiyanghkust/finbert-tone",
-         num_labels=2,
-         ignore_mismatched_sizes=True
-     )
-     for param in model.bert.encoder.layer[:6].parameters():
-         param.requires_grad = False
-     return model
-
- # Function to train model
- def train_model(tokenized_train, tokenized_val, model):
-     training_args = TrainingArguments(
-         output_dir="./results",
-         num_train_epochs=5,
-         per_device_train_batch_size=32,
-         per_device_eval_batch_size=32,
-         eval_strategy="epoch",
-         save_strategy="epoch",
-         load_best_model_at_end=True,
-         metric_for_best_model="accuracy",
-         learning_rate=5e-5,
-         weight_decay=0.1,
-         report_to="none",
-     )
-     total_steps = len(tokenized_train) // training_args.per_device_train_batch_size * training_args.num_train_epochs
-     optimizer = torch.optim.AdamW(model.parameters(), lr=training_args.learning_rate)
-     trainer = Trainer(
-         model=model,
-         args=training_args,
-         train_dataset=tokenized_train,
-         eval_dataset=tokenized_val,
-         compute_metrics=lambda eval_pred: evaluate.load("accuracy").compute(predictions=np.argmax(eval_pred.predictions, axis=1), references=eval_pred.label_ids),
-         optimizers=(optimizer, get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)),
-     )
-     trainer.train()
-     trainer.save_model("./fine_tuned_model")
-     return trainer
-
- # Function to evaluate model
- def evaluate_model(pipe, df, model_name=""):
-     results = []
-     total_start = time.perf_counter()
-     for stock, group in tqdm(df.groupby("Stock")):
-         headlines = group["Headline"].tolist()
-         true_trend = group["Trend"].iloc[0]
-         try:
-             preds = pipe(headlines, truncation=True)
-         except Exception as e:
-             st.error(f"Error for {stock}: {e}")
-             continue
-         labels = [p['label'] for p in preds]
-         count = Counter(labels)
-         num_pos, num_neg = count.get("Positive", 0), count.get("Negative", 0)
-         predicted_trend = "Positive" if num_pos > num_neg else "Negative"
-         match = predicted_trend == true_trend
-         results.append(match)
-     total_runtime = time.perf_counter() - total_start
-     accuracy = sum(results) / len(results) if results else 0
-     st.write(f"**🔍 Evaluation Summary for {model_name}**")
-     st.write(f"✅ Accuracy: {accuracy:.2%}")
-     st.write(f"⏱ Total Runtime: {total_runtime:.2f} seconds")
-     return accuracy
 
  # Streamlit UI
- st.title("Financial Sentiment Analysis with FinBERT")
- st.markdown("Upload your CSV files to train and evaluate a sentiment analysis model on financial news headlines.")

- st.header("Upload CSV Files")
- news_file = st.file_uploader("Upload Train_stock_news.csv", type="csv")
- trend_file = st.file_uploader("Upload Training_price_comparison.csv", type="csv")

- if news_file and trend_file:
-     with st.spinner("Processing data..."):
-         df = load_and_process_data(news_file, trend_file)
-     check_class_imbalance(df)
-     train_df, val_df, test_df = split_data(df)
-     st.write(f"**Training stocks:** {len(train_df['Stock'].unique())}")
-     st.write(f"**Validation stocks:** {len(val_df['Stock'].unique())}")
-     st.write(f"**Test stocks:** {len(test_df['Stock'].unique())}")
-
-     tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")
-     tokenized_train, tokenized_val, tokenized_test = tokenize_datasets(train_df, val_df, test_df, tokenizer)
-
-     model = load_model()
-
-     with st.spinner("Training model..."):
-         trainer = train_model(tokenized_train, tokenized_val, model)
-
-     st.success("Model training completed!")
-
-     # Evaluate original model
-     original_pipe = pipeline("text-classification", model="yiyanghkust/finbert-tone")
-     st.write("Evaluating original model...")
-     original_accuracy = evaluate_model(original_pipe, test_df, model_name="Original Model")
-
-     # Evaluate fine-tuned model
-     fine_tuned_pipe = pipeline("text-classification", model="./fine_tuned_model")
-     st.write("Evaluating fine-tuned model...")
-     fine_tuned_accuracy = evaluate_model(fine_tuned_pipe, test_df, model_name="Fine-tuned Model")

-     st.write("**Comparison:**")
-     st.write(f"Original Model Accuracy: {original_accuracy:.2%}")
-     st.write(f"Fine-tuned Model Accuracy: {fine_tuned_accuracy:.2%}")
- else:
-     st.warning("Please upload both CSV files to proceed.")

  import streamlit as st
+ import requests
+ from bs4 import BeautifulSoup
  from transformers import pipeline

+ # Initialize sentiment analysis pipeline
+ sentiment_pipeline = pipeline("sentiment-analysis")
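+ # Note: with no model argument, pipeline() falls back to a default English
+ # sentiment model whose labels are POSITIVE/NEGATIVE; analyze_sentiment()
+ # below relies on that label scheme.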
+
+ # Function to fetch top 3 news articles from FinViz
+ def fetch_news(ticker):
+     try:
+         url = f"https://finviz.com/quote.ashx?t={ticker}"
+         headers = {'User-Agent': 'Mozilla/5.0'}
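+         # A browser-like User-Agent is sent because FinViz tends to reject
+         # requests that use the default python-requests identifier.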
+         response = requests.get(url, headers=headers)
+         soup = BeautifulSoup(response.text, 'html.parser')
+         news_table = soup.find(id='news-table')
+         news = []
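+         # soup.find() returns None when the page has no news table (e.g. an
+         # unrecognized ticker); the AttributeError raised in the loop below
+         # is caught by the except clause.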
+         for row in news_table.findAll('tr')[:3]:  # Limit to top 3
+             title = row.a.get_text()
+             link = row.a['href']
+             news.append({'title': title, 'link': link})
+         return news
+     except Exception as e:
+         st.error(f"Failed to fetch news for {ticker}: {e}")
+         return []
+
+ # Function to analyze sentiment of news title
+ def analyze_sentiment(text):
+     try:
+         result = sentiment_pipeline(text)[0]
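+         # pipeline() returns a list of {'label', 'score'} dicts; [0] takes
+         # the single result for this one headline.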
+         return "Positive" if result['label'] == 'POSITIVE' else "Negative"
+     except Exception as e:
+         st.error(f"Sentiment analysis failed: {e}")
+         return "Unknown"

  # Streamlit UI
+ st.title("Stock News Sentiment Analysis")

+ # Input field for stock tickers
+ tickers_input = st.text_input("Enter five stock tickers separated by commas (e.g., AAPL, MSFT, GOOGL, AMZN, TSLA):")

+ if st.button("Get News and Sentiment"):
+     if tickers_input:
+         tickers = [ticker.strip().upper() for ticker in tickers_input.split(',')]
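+         # Note: split(',') keeps empty entries from stray trailing commas,
+         # and those still count toward the five-ticker check below.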
 
+         # Validate input
+         if len(tickers) != 5:
+             st.error("Please enter exactly five stock tickers.")
+         else:
+             # Process each ticker
+             for ticker in tickers:
+                 st.subheader(f"Top 3 News Articles for {ticker}")
+                 news_list = fetch_news(ticker)
+
+                 if news_list:
+                     for i, news in enumerate(news_list, 1):
+                         sentiment = analyze_sentiment(news['title'])
+                         st.markdown(f"{i}. [{news['title']}]({news['link']}) - **{sentiment}**")
+                 else:
+                     st.write("No news available for this ticker.")
+     else:
+         st.warning("Please enter stock tickers.")