Spaces:

umang018
/

pg3

Sleeping

pg3

File size: 2,691 Bytes

c1880b4
a72bbed
c1880b4
 
 
a72bbed
c1880b4
1426c29
 
 
c1880b4
616d1b5
c1880b4
1426c29
c1880b4
 
 
 
 
 
 
 
 
602bb55
a3dbcca
1426c29
a72bbed
a3dbcca
1426c29
 
 
 
 
 
 
602bb55
a72bbed
 
ba95aa8
a72bbed
a3dbcca
 
 
1426c29
a72bbed
 
 
 
 
 
 
 
 
 
 
 
 
a3dbcca
 
 
 
 
a72bbed
a3dbcca

import pandas as pd
import streamlit as st
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from datasets import load_dataset
import time

# Use the GPU when torch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hugging Face Hub checkpoint used for both the tokenizer and the classifier.
model_name = "SamLowe/roberta-base-go_emotions"


@st.cache_resource
def _load_tokenizer_and_model(name):
    """Load the tokenizer and the classifier for checkpoint ``name``.

    Streamlit re-executes this script from the top on every widget
    interaction. Caching the loaded objects keeps a single tokenizer/model
    pair alive per server process instead of reloading the checkpoint on
    each rerun (for example, every time the "Run Inference" button is
    clicked).

    Args:
        name: Checkpoint identifier passed to ``from_pretrained``.

    Returns:
        A ``(tokenizer, model)`` tuple; the model has been moved to ``device``.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(name)
    loaded_model = AutoModelForSequenceClassification.from_pretrained(name).to(device)
    return loaded_tokenizer, loaded_model


# Load the model and tokenizer (served from Streamlit's resource cache on reruns)
tokenizer, model = _load_tokenizer_and_model(model_name)

# Define the emotion labels (based on the GoEmotions dataset)
# NOTE(review): this list is not referenced in the visible part of the file;
# the classifier below returns integer class ids. Presumably ids index into
# this list - confirm the order against model.config.id2label before mapping.
emotion_labels = ["admiration", "amusement", "anger", "annoyance", "approval",
                  "caring", "confusion", "curiosity", "desire", "disappointment",
                  "disapproval", "disgust", "embarrassment", "excitement", "fear",
                  "gratitude", "grief", "joy", "love", "nervousness", "optimism",
                  "pride", "realization", "relief", "remorse", "sadness", "surprise",
                  "neutral"]

# Function to classify emotions in batches
def classify_emotions_in_batches(texts, batch_size=64, num_batches=20):
    """Classify the leading items of ``texts`` in batches and return class ids.

    Only the first ``num_batches * batch_size`` items (or all of ``texts`` if
    it is shorter) are processed. Each batch is tokenized with the
    module-level ``tokenizer``, run through the module-level ``model`` on
    ``device`` without gradient tracking, and reduced to the argmax index of
    the logits. A progress line is written to the Streamlit page after every
    batch.

    Args:
        texts: Sequence of texts supporting ``len()`` and slicing. Entries
            that are not ``str`` (e.g. ``None``) are classified as the empty
            string, so one missing value does not abort the run and the
            results stay aligned with the input positions.
        batch_size: Number of texts per forward pass; must be at least 1.
        num_batches: Maximum number of batches to process.

    Returns:
        A list of integer class ids, one per processed text, in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Number of texts that will actually be processed, and how many batches
    # that amounts to (ceil division), so the progress line reports the real
    # total rather than the requested maximum.
    limit = max(0, min(num_batches * batch_size, len(texts)))
    total_batches = (limit + batch_size - 1) // batch_size

    results = []
    for batch_number, start in enumerate(range(0, limit, batch_size), start=1):
        batch_started = time.perf_counter()

        # Batch starts are multiples of batch_size below `limit`, so this
        # slice never reaches past `limit`; no trimming is needed afterwards.
        batch = [
            text if isinstance(text, str) else ""
            for text in texts[start:start + batch_size]
        ]
        inputs = tokenizer(batch, return_tensors="pt", truncation=True, padding=True).to(device)
        with torch.no_grad():
            logits = model(**inputs).logits
        results.extend(torch.argmax(logits, dim=-1).tolist())

        # Log progress
        batch_time = time.perf_counter() - batch_started
        st.write(f"Processed batch {batch_number} of {total_batches} in {batch_time:.2f} seconds")

    return results

# Streamlit interface
st.title("Enron Emails Emotion Analysis")

# Button to run the inference script
if st.button("Run Inference"):
    # Load the Enron dataset
    with st.spinner('Loading dataset...'):
        dataset = load_dataset("Hellisotherpeople/enron_emails_parsed")
        enron_data = pd.DataFrame(dataset['train'])

    # Apply emotion classification to the email content
    with st.spinner('Running inference...'):
        email_texts = enron_data['body'].tolist()
        results = classify_emotions_in_batches(email_texts, batch_size=64)
        
        # Add results to the DataFrame and save
        enron_data = enron_data.iloc[:len(results)].copy()
        enron_data['emotion'] = results

        # Save the results to a CSV f