Spaces:

umang018
/

pg3

Sleeping

pg3

File size: 3,041 Bytes

c1880b4
a72bbed
c1880b4
 
 
a72bbed
c1880b4
1426c29
 
 
c1880b4
616d1b5
c1880b4
1426c29
c1880b4
 
 
 
 
 
 
 
 
20aca11
 
1426c29
a72bbed
20aca11
 
 
 
1426c29
 
 
 
 
 
 
 
20aca11
 
 
 
 
 
 
 
 
 
 
 
 
 
a72bbed
 
 
 
20aca11
 
1426c29
a72bbed
 
 
 
 
 
 
 
 
 
 
 
 
20aca11
a72bbed

import pandas as pd
import streamlit as st
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from datasets import load_dataset
import time

# Run on CUDA when PyTorch reports a usable GPU; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Checkpoint identifier shared by the tokenizer and the classifier.
model_name = "SamLowe/roberta-base-go_emotions"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# Move the classifier's weights onto the selected device.
model = model.to(device)

# Emotion names for the GoEmotions label set, four per row.
# List position is used as the class id when predictions are mapped to names,
# so the order matters (presumably it mirrors the model's own label order --
# confirm against the checkpoint's config before reordering).
emotion_labels = [
    "admiration", "amusement", "anger", "annoyance",
    "approval", "caring", "confusion", "curiosity",
    "desire", "disappointment", "disapproval", "disgust",
    "embarrassment", "excitement", "fear", "gratitude",
    "grief", "joy", "love", "nervousness",
    "optimism", "pride", "realization", "relief",
    "remorse", "sadness", "surprise", "neutral",
]

# Function to classify emotions in batches and save results after each batch
def classify_emotions_in_batches(texts, batch_size=64, output_file="enron_emails_with_emotions.csv", metadata=None):
    """Classify the top emotion of each text in batches, checkpointing to CSV.

    Each batch is tokenized, passed through the module-level ``model``, and
    the arg-max class id is translated to a name through ``emotion_labels``.
    The rows of every finished batch are written to ``output_file`` right
    away and a progress line is emitted with ``st.write``.

    Args:
        texts: Sequence of strings to classify. Row ``k`` is paired by
            position with row ``k`` of ``metadata``.
        batch_size: Number of texts per forward pass; must be at least 1.
        output_file: Destination CSV. The first batch overwrites it (with a
            header row); later batches are appended.
        metadata: DataFrame supplying the ``'From'`` and ``'To'`` columns.
            When omitted, the module-level ``enron_data`` frame is used,
            which is what the function always read before this parameter
            existed.

    Returns:
        A DataFrame with the columns ``From``, ``To``, ``body`` and
        ``emotion``, one row per input text. For empty ``texts`` an empty
        frame with those columns is returned and no file is written.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    columns = ['From', 'To', 'body', 'emotion']

    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    total = len(texts)
    if total == 0:
        return pd.DataFrame(columns=columns)

    if metadata is None:
        # Backward-compatible default: fall back to the frame loaded by the UI code.
        metadata = enron_data

    # Ceiling division, so an exact multiple of batch_size is not over-counted.
    total_batches = (total + batch_size - 1) // batch_size

    # Collect per-batch frames and concatenate once at the end; growing one
    # frame with pd.concat on every iteration copies all earlier rows each time.
    batch_frames = []

    for batch_number, start in enumerate(range(0, total, batch_size), start=1):
        batch_started = time.perf_counter()
        stop = start + batch_size
        batch = texts[start:stop]

        inputs = tokenizer(batch, return_tensors="pt", truncation=True, padding=True).to(device)
        with torch.no_grad():
            logits = model(**inputs).logits
        predicted_class_ids = torch.argmax(logits, dim=-1).tolist()

        batch_df = pd.DataFrame({
            'From': metadata['From'].iloc[start:stop],
            'To': metadata['To'].iloc[start:stop],
            'body': batch,
            'emotion': [emotion_labels[idx] for idx in predicted_class_ids],
        })
        batch_frames.append(batch_df)

        # Write only the new rows: the first batch truncates the file and
        # emits the header, later batches append. The file content matches a
        # full rewrite without re-serialising every earlier row per batch.
        is_first_batch = batch_number == 1
        batch_df.to_csv(
            output_file,
            mode='w' if is_first_batch else 'a',
            header=is_first_batch,
            index=False,
        )

        # Log progress
        batch_time = time.perf_counter() - batch_started
        st.write(f"Processed batch {batch_number} of {total_batches} in {batch_time:.2f} seconds")

    return pd.concat(batch_frames)

# Streamlit page: a title and one button that kicks off the whole run.
st.title("Enron Emails Emotion Analysis")

# st.button is evaluated once per script run; the branch below executes only
# on the run in which the button was pressed.
run_requested = st.button("Run Inference")
if run_requested:
    # Step 1: load the dataset and turn its 'train' split into a DataFrame.
    # enron_data is kept at module level because the batch classifier reads it.
    with st.spinner('Loading dataset...'):
        dataset = load_dataset("Hellisotherpeople/enron_emails_parsed")
        enron_data = pd.DataFrame(dataset['train'])

    # Step 2: classify every email body; results are written to CSV by the
    # classifier itself, so its return value is not needed here.
    with st.spinner('Running inference...'):
        email_texts = enron_data['body'].to_list()
        classify_emotions_in_batches(email_texts, batch_size=64)

    st.success("Inference completed and results saved!")