szeandlinkProject_Testing

Sleeping

App Files Files Community

Szeyu committed on May 15

Commit

6af8332

verified ·

1 Parent(s): 90ab0d9

Update app.py

Browse files

Files changed (1) hide show

app.py +147 -121

app.py CHANGED Viewed

@@ -1,135 +1,161 @@
-import re
 import streamlit as st
-from transformers import pipeline
-import textwrap
 import numpy as np
-import soundfile as sf
-import tempfile
-import os
-from PIL import Image
-import string
-# Build the three model pipelines once; the result is memoized by st.cache_resource
-@st.cache_resource
-def load_pipelines():
-    """Return the (captioner, storyer, tts) pipelines used by the app."""
-    return (
-        pipeline("image-to-text", model="Salesforce/blip-image-captioning-large"),
-        pipeline("text-generation", model="aspis/gpt2-genre-story-generation"),
-        pipeline("text-to-speech", model="facebook/mms-tts-eng"),
-    )
-captioner, storyer, tts = load_pipelines()
-def clean_generated_story(raw_story: str) -> str:
-    """
-    Clean the generated story text.
-
-    Steps, in order:
-    1. Remove URLs (http://, https://, www.) and bare domain names.
-    2. Remove digits.
-    3. Drop tokens that contain no vowel. 'y' counts as a vowel so that
-       ordinary words such as "my", "fly" or "by" are kept.
-    4. Drop single-letter tokens other than "a" and "I".
-    5. Keep at most the first 100 remaining words.
-
-    Punctuation around a token is ignored when judging it, but is kept in
-    the output for tokens that survive.
-    """
-    # No trailing \b: a URL ending in "/" must be removed whole, not leave the slash behind.
-    no_urls = re.sub(r'\b(?:https?://|www\.)\S+', '', raw_story)
-    # Remove domain names without protocol (e.g., erskybooks.com)
-    no_urls = re.sub(r'\b\w+\.(?:com|net|org|co\.uk|ca\.us|me)\b', '', no_urls)
-    # Remove all digits
-    story_without_numbers = re.sub(r'\d+', '', no_urls)
-    vowels = set('aeiouyAEIOUY')
-    def is_valid_word(word: str) -> bool:
-        # Judge the token by its letters only, so "I," or "cat." are not
-        # misclassified because of attached punctuation.
-        core = word.strip(string.punctuation)
-        if not core:
-            # Token was punctuation only.
-            return False
-        if len(core) == 1:
-            # Allow "a" and "I" for single-letter words
-            return core.lower() in ('a', 'i')
-        return any(char in vowels for char in core)
-    # Split the cleaned text into words, filter them, and reassemble
-    filtered_words = [word for word in story_without_numbers.split() if is_valid_word(word)]
-    # Trim the cleaned story to the first 100 words
-    return " ".join(filtered_words[:100])
-def get_caption(image) -> str:
-    """
-    Caption the given image, show the caption in the app, and return it.
-    """
-    outputs = captioner(Image.open(image))
-    caption = outputs[0]["generated_text"]
-    st.write("**🌟 What's in the picture: 🌟**")
-    st.write(caption)
-    return caption
-def get_story(caption: str) -> str:
-    """
-    Generate a story from the caption, clean it, display it, and return it.
-    """
-    # Settings forwarded to the text-generation pipeline.
-    generation_options = {
-        "max_new_tokens": 150,
-        "temperature": 0.7,
-        "top_p": 0.9,
-        "no_repeat_ngram_size": 2,
-        "return_full_text": False,
-    }
-    prompt = (
-        f"Write a funny and playful story for young children precisely centered on this scene {caption}\nStory: "
-        f"mention the exact place and venue within {caption}. "
-        f"Make the story magical and exciting."
     )
-    outputs = storyer(prompt, **generation_options)
-    raw = outputs[0]["generated_text"].strip()
-    story = clean_generated_story(raw)
-    st.write("**📖 Your funny story: 📖**")
-    st.write(story)
-    return story
-def generate_audio(story: str) -> str:
-    """
-    Convert the story text to speech and write it to a temporary WAV file.
-
-    The story is wrapped into chunks of at most 200 characters, each chunk is
-    synthesized with the `tts` pipeline, and the pieces are concatenated.
-
-    Returns the path of the WAV file. The file is created with delete=False,
-    so the caller is responsible for removing it.
-    Raises ValueError if the story contains no text to speak.
-    """
-    chunks = textwrap.wrap(story, width=200)
-    if not chunks:
-        # np.concatenate([]) would fail with an opaque message; fail clearly instead.
-        raise ValueError("Cannot generate audio from an empty story.")
-    audio = np.concatenate([tts(chunk)["audio"].squeeze() for chunk in chunks])
-    # Reserve a file name, then close the handle before writing so the file is
-    # not open twice at once (which fails on some platforms).
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
-        temp_file_path = temp_file.name
-    sf.write(temp_file_path, audio, tts.model.config.sampling_rate)
-    return temp_file_path
-def generate_content(image):
-    """
-    Run the full pipeline for one image: caption -> story -> speech audio.
-
-    Returns a (caption, story, audio_path) tuple.
-    """
-    caption = get_caption(image)
-    story = get_story(caption)
-    return caption, story, generate_audio(story)
-# Streamlit UI section
-st.title("✨ Magic Story Maker ✨")
-st.markdown("Upload a picture to make a funny story and hear it too! 📸")
-uploaded_image = st.file_uploader("Choose your picture", type=["jpg", "jpeg", "png"])
-# Preview area: a placeholder until a file is chosen, then the upload itself.
-if uploaded_image is None:
-    st.image("https://example.com/placeholder_image.jpg", caption="Upload your picture here! 📷", use_container_width=True)
 else:
-    st.image(uploaded_image, caption="Your Picture 🌟", use_container_width=True)
-if st.button("✨ Make My Story! ✨"):
-    # Guard first: nothing can be generated without an upload.
-    if uploaded_image is None:
-        st.warning("Please upload a picture first! 📸")
-    else:
-        with st.spinner("🔮 Creating your magical story..."):
-            caption, story, audio_path = generate_content(uploaded_image)
-            st.success("🎉 Your story is ready! 🎉")
-            st.audio(audio_path, format="audio/wav")
-            # The temporary WAV is removed once it has been handed to st.audio.
-            os.remove(audio_path)

 import streamlit as st
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from datasets import Dataset
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, get_linear_schedule_with_warmup
 import numpy as np
+import torch
+from transformers import pipeline
+from collections import Counter
+import time
+from tqdm import tqdm
+import evaluate
+# Function to load and process data
+def load_and_process_data(news_file, trend_file):
+    """
+    Load the news and trend CSVs and return the news rows labeled by stock trend.
+
+    Args:
+        news_file: Path or file-like object for the news CSV; must contain a
+            'Stock' column.
+        trend_file: Path or file-like object for the trend CSV; must contain
+            'Symbol' and 'Trend' columns.
+
+    Returns:
+        DataFrame of the news rows whose stock trend is 'Positive' or
+        'Negative', with an added integer 'label' column
+        (Negative=0, Positive=1).
+    """
+    news_df = pd.read_csv(news_file)
+    trend_df = pd.read_csv(trend_file).rename(columns={'Symbol': 'Stock'})
+    # Keep one trend row per stock: duplicate symbols would otherwise
+    # multiply every headline of that stock in the merge below.
+    trend_lookup = trend_df[['Stock', 'Trend']].drop_duplicates(subset='Stock')
+    news_labeled_df = news_df.merge(trend_lookup, on='Stock', how='left')
+    # .copy() makes the filtered frame independent, so adding 'label' is not
+    # a chained assignment on a slice of the merge result.
+    news_labeled_df = news_labeled_df[news_labeled_df['Trend'].isin(['Positive', 'Negative'])].copy()
+    label_map = {'Negative': 0, 'Positive': 1}
+    news_labeled_df['label'] = news_labeled_df['Trend'].map(label_map).astype(int)
+    return news_labeled_df
+# Function to check class imbalance
+def check_class_imbalance(df):
+    """
+    Show the distribution of df['label'] and warn when it is unbalanced.
+
+    A warning is shown when the rarest class has fewer than half as many rows
+    as the most common one, or when fewer than two classes are present at all.
+    """
+    class_counts = df['label'].value_counts()
+    st.write("**Class Distribution:**", class_counts.to_dict())
+    if class_counts.empty:
+        # No rows: the min/max ratio below would be NaN and silently pass.
+        st.warning("Warning: No labeled rows found; class balance cannot be checked.")
+        return
+    # A class with zero rows does not appear in value_counts(), so a single
+    # present class must be treated as imbalance explicitly (min == max there).
+    if len(class_counts) < 2 or class_counts.min() / class_counts.max() < 0.5:
+        st.warning("Warning: Class imbalance detected. Consider balancing techniques.")
+# Function to split data
+def split_data(df):
+    """
+    Split df into train/validation/test frames by stock, not by row.
+
+    All rows of a given stock land in the same split. 20% of the stocks go to
+    the test set; 25% of the remainder (20% overall) go to validation.
+    Both splits use random_state=42.
+    """
+    unique_stocks = df['Stock'].unique()
+    remaining_stocks, test_stocks = train_test_split(unique_stocks, test_size=0.2, random_state=42)
+    train_stocks, val_stocks = train_test_split(remaining_stocks, test_size=0.25, random_state=42)
+    train_df, val_df, test_df = (
+        df[df['Stock'].isin(stock_group)]
+        for stock_group in (train_stocks, val_stocks, test_stocks)
+    )
+    return train_df, val_df, test_df
+# Function to tokenize datasets
+def tokenize_datasets(train_df, val_df, test_df, tokenizer):
+    """
+    Convert the three frames to tokenized datasets.
+
+    Only the 'Headline' and 'label' columns are kept. Headlines are padded
+    and truncated to 128 tokens.
+
+    Returns (tokenized_train, tokenized_val, tokenized_test).
+    """
+    def encode_batch(batch):
+        return tokenizer(batch['Headline'], padding='max_length', truncation=True, max_length=128)
+    def to_tokenized(frame):
+        dataset = Dataset.from_pandas(frame[['Headline', 'label']])
+        return dataset.map(encode_batch, batched=True)
+    return to_tokenized(train_df), to_tokenized(val_df), to_tokenized(test_df)
+# Function to load model with caching
+@st.cache_resource
+def load_model():
+    """
+    Load FinBERT-tone with a two-label classification head for fine-tuning.
+
+    The label names are stored in the model config so that a pipeline built
+    from the saved model reports "Negative"/"Positive" (the strings
+    evaluate_model counts) instead of the default "LABEL_0"/"LABEL_1".
+    The ids match the encoding used for the 'label' column
+    (Negative=0, Positive=1). The first six encoder layers are frozen.
+
+    NOTE(review): the object cached by st.cache_resource is the same one that
+    training later updates in place; confirm that a rerun is meant to
+    continue from already-trained weights.
+    """
+    model = AutoModelForSequenceClassification.from_pretrained(
+        "yiyanghkust/finbert-tone",
+        num_labels=2,
+        id2label={0: "Negative", 1: "Positive"},
+        label2id={"Negative": 0, "Positive": 1},
+        # The checkpoint's head has a different size; allow loading anyway.
+        ignore_mismatched_sizes=True
     )
+    # Freeze the lower half of the encoder; only the remaining layers and the
+    # classification head are trained.
+    for param in model.bert.encoder.layer[:6].parameters():
+        param.requires_grad = False
+    return model
+# Function to train model
+def train_model(tokenized_train, tokenized_val, model):
+    """
+    Fine-tune `model` on tokenized_train, evaluating on tokenized_val each epoch.
+
+    The checkpoint with the best validation accuracy is restored at the end
+    and saved to ./fine_tuned_model.
+
+    Returns the Trainer instance.
+    """
+    training_args = TrainingArguments(
+        output_dir="./results",
+        num_train_epochs=5,
+        per_device_train_batch_size=32,
+        per_device_eval_batch_size=32,
+        eval_strategy="epoch",
+        save_strategy="epoch",
+        load_best_model_at_end=True,
+        metric_for_best_model="accuracy",
+        learning_rate=5e-5,
+        weight_decay=0.1,
+        report_to="none",
+    )
+    # Load the metric once instead of on every evaluation pass.
+    accuracy_metric = evaluate.load("accuracy")
+    def compute_metrics(eval_pred):
+        predictions = np.argmax(eval_pred.predictions, axis=1)
+        scores = accuracy_metric.compute(predictions=predictions, references=eval_pred.label_ids)
+        # compute() already returns {"accuracy": value}. Return the plain
+        # number so metric_for_best_model compares floats, not nested dicts.
+        return {"accuracy": scores["accuracy"]}
+    # No hand-built optimizer/scheduler is passed. The previous pair sized the
+    # schedule with floor division (ending the decay early when the dataset
+    # size is not a multiple of the batch size) and bypassed the weight_decay
+    # configured above. Trainer builds its default optimizer and linear
+    # schedule from training_args instead.
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=tokenized_train,
+        eval_dataset=tokenized_val,
+        compute_metrics=compute_metrics,
+    )
+    trainer.train()
+    trainer.save_model("./fine_tuned_model")
+    return trainer
+# Function to evaluate model
+def evaluate_model(pipe, df, model_name=""):
+    """
+    Score a text-classification pipeline per stock and report the result.
+
+    For each stock, all of its headlines are classified and the majority of
+    "Positive" vs "Negative" predictions (ties count as "Negative") is
+    compared with the stock's 'Trend'. Stocks whose prediction call raises are
+    reported with st.error and left out of the accuracy.
+
+    Returns the fraction of evaluated stocks predicted correctly (0 if none).
+    """
+    outcomes = []
+    started_at = time.perf_counter()
+    for stock, stock_rows in tqdm(df.groupby("Stock")):
+        expected_trend = stock_rows["Trend"].iloc[0]
+        stock_headlines = stock_rows["Headline"].tolist()
+        try:
+            predictions = pipe(stock_headlines, truncation=True)
+        except Exception as e:
+            st.error(f"Error for {stock}: {e}")
+            continue
+        # Counter returns 0 for labels that never occurred.
+        votes = Counter(prediction['label'] for prediction in predictions)
+        if votes["Positive"] > votes["Negative"]:
+            predicted_trend = "Positive"
+        else:
+            predicted_trend = "Negative"
+        outcomes.append(predicted_trend == expected_trend)
+    total_runtime = time.perf_counter() - started_at
+    if outcomes:
+        accuracy = sum(outcomes) / len(outcomes)
+    else:
+        accuracy = 0
+    st.write(f"**🔍 Evaluation Summary for {model_name}**")
+    st.write(f"✅ Accuracy: {accuracy:.2%}")
+    st.write(f"⏱ Total Runtime: {total_runtime:.2f} seconds")
+    return accuracy
+# Streamlit UI
+st.title("Financial Sentiment Analysis with FinBERT")
+st.markdown("Upload your CSV files to train and evaluate a sentiment analysis model on financial news headlines.")
+st.header("Upload CSV Files")
+news_file = st.file_uploader("Upload Train_stock_news.csv", type="csv")
+trend_file = st.file_uploader("Upload Training_price_comparison.csv", type="csv")
+# The whole train/evaluate flow runs only once both files are provided.
+if news_file and trend_file:
+    with st.spinner("Processing data..."):
+        df = load_and_process_data(news_file, trend_file)
+        check_class_imbalance(df)
+        train_df, val_df, test_df = split_data(df)
+        st.write(f"**Training stocks:** {len(train_df['Stock'].unique())}")
+        st.write(f"**Validation stocks:** {len(val_df['Stock'].unique())}")
+        st.write(f"**Test stocks:** {len(test_df['Stock'].unique())}")
+        tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")
+        tokenized_train, tokenized_val, tokenized_test = tokenize_datasets(train_df, val_df, test_df, tokenizer)
+        model = load_model()
+        with st.spinner("Training model..."):
+            trainer = train_model(tokenized_train, tokenized_val, model)
+        st.success("Model training completed!")
+        # Evaluate original model
+        original_pipe = pipeline("text-classification", model="yiyanghkust/finbert-tone")
+        st.write("Evaluating original model...")
+        original_accuracy = evaluate_model(original_pipe, test_df, model_name="Original Model")
+        # Evaluate fine-tuned model
+        # The Trainer was not given a tokenizer, so ./fine_tuned_model presumably
+        # holds model weights only. Pass the tokenizer explicitly rather than
+        # relying on tokenizer files being present in that directory.
+        fine_tuned_pipe = pipeline("text-classification", model="./fine_tuned_model", tokenizer=tokenizer)
+        st.write("Evaluating fine-tuned model...")
+        fine_tuned_accuracy = evaluate_model(fine_tuned_pipe, test_df, model_name="Fine-tuned Model")
+        st.write("**Comparison:**")
+        st.write(f"Original Model Accuracy: {original_accuracy:.2%}")
+        st.write(f"Fine-tuned Model Accuracy: {fine_tuned_accuracy:.2%}")
 else:
+    st.warning("Please upload both CSV files to proceed.")