import time

import pandas as pd
import streamlit as st
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Check if a GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model and tokenizer
model_name = "SamLowe/roberta-base-go_emotions"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
model.eval()

# Define the emotion labels (based on the GoEmotions dataset)
emotion_labels = ["admiration", "amusement", "anger", "annoyance", "approval", "caring",
                  "confusion", "curiosity", "desire", "disappointment", "disapproval",
                  "disgust", "embarrassment", "excitement", "fear", "gratitude", "grief",
                  "joy", "love", "nervousness", "optimism", "pride", "realization",
                  "relief", "remorse", "sadness", "surprise", "neutral"]

# Function to classify emotions in batches
def classify_emotions_in_batches(texts, batch_size=64, num_batches=20):
    results = []
    limit = min(num_batches * batch_size, len(texts))
    start_time = time.time()
    for i in range(0, limit, batch_size):
        batch = texts[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", truncation=True, padding=True).to(device)
        with torch.no_grad():
            outputs = model(**inputs)
        logits = outputs.logits
        # Take the highest-scoring class for each email and map it to its emotion label
        predicted_class_ids = torch.argmax(logits, dim=-1).tolist()
        results.extend(emotion_labels[class_id] for class_id in predicted_class_ids)

        # Log per-batch progress
        batch_time = time.time() - start_time
        st.write(f"Processed batch {i // batch_size + 1} of {num_batches} in {batch_time:.2f} seconds")
        start_time = time.time()

    # Ensure the results length matches the number of processed texts
    return results[:limit]

# Streamlit interface
st.title("Enron Emails Emotion Analysis")

# Button to run the inference script
if st.button("Run Inference"):
    # Load the Enron dataset
    with st.spinner('Loading dataset...'):
        dataset = load_dataset("Hellisotherpeople/enron_emails_parsed")
        enron_data = pd.DataFrame(dataset['train'])

    # Apply emotion classification to the email content
    with st.spinner('Running inference...'):
        email_texts = enron_data['body'].tolist()
        results = classify_emotions_in_batches(email_texts, batch_size=64)

    # Add results to the DataFrame (only the rows that were actually classified)
    enron_data = enron_data.iloc[:len(results)].copy()
    enron_data['emotion'] = results

    # Save the results to a CSV file (output filename is illustrative)
    enron_data.to_csv("enron_emotions.csv", index=False)
    st.success("Inference complete. Results saved to enron_emotions.csv")
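# To launch the app locally (assuming this script is saved as app.py):
#     streamlit run app.py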