umang018 committed on
Commit
1426c29
·
verified ·
1 Parent(s): cb1a6c1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -47
app.py CHANGED
@@ -1,13 +1,19 @@
1
  import pandas as pd
2
- import streamlit as st
3
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
4
  import torch
5
  from datasets import load_dataset
6
 
 
 
 
 
 
 
 
7
  # Load the model and tokenizer
8
- model_name = "SamLowe/roberta-base-go_emotions"
9
  tokenizer = AutoTokenizer.from_pretrained(model_name)
10
- model = AutoModelForSequenceClassification.from_pretrained(model_name)
11
 
12
  # Define the emotion labels (based on the GoEmotions dataset)
13
  emotion_labels = ["admiration", "amusement", "anger", "annoyance", "approval",
@@ -17,47 +23,22 @@ emotion_labels = ["admiration", "amusement", "anger", "annoyance", "approval",
17
  "pride", "realization", "relief", "remorse", "sadness", "surprise",
18
  "neutral"]
19
 
20
- # Function to classify emotion
21
- def classify_emotion(text):
22
- inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
23
- outputs = model(**inputs)
24
- logits = outputs.logits
25
- predicted_class_id = torch.argmax(logits, dim=-1).item()
26
- return emotion_labels[predicted_class_id]
27
-
28
- # Streamlit interface
29
- st.title("Enron Emails Emotion Analysis")
30
-
31
- # Button to run the inference script
32
- if st.button("Run Inference"):
33
- # Load the Enron dataset
34
- with st.spinner('Loading dataset...'):
35
- dataset = load_dataset("Hellisotherpeople/enron_emails_parsed")
36
- enron_data = pd.DataFrame(dataset['train'])
37
-
38
- # Apply emotion classification to the email content
39
- with st.spinner('Running inference...'):
40
- enron_data['emotion'] = enron_data['body'].apply(classify_emotion)
41
-
42
- # Save the results to a CSV file
43
- enron_data.to_csv("enron_emails_with_emotions.csv", index=False)
44
- st.success("Inference completed and results saved!")
45
-
46
- # Check if the results file exists and load it
47
- try:
48
- enron_data = pd.read_csv("enron_emails_with_emotions.csv")
49
-
50
- # Dropdown for selecting an emotion
51
- selected_emotion = st.selectbox("Select Emotion", emotion_labels)
52
-
53
- # Filter emails based on the selected emotion
54
- filtered_emails = enron_data[enron_data['emotion'] == selected_emotion].head(10)
55
-
56
- # Display the filtered emails in a table
57
- if not filtered_emails.empty:
58
- st.write("Top 10 emails with emotion:", selected_emotion)
59
- st.table(filtered_emails[['From', 'To', 'body', 'emotion']])
60
- else:
61
- st.write("No emails found with the selected emotion.")
62
- except FileNotFoundError:
63
- st.warning("Run inference first by clicking the 'Run Inference' button.")
 
1
  import pandas as pd
 
2
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
3
  import torch
4
  from datasets import load_dataset
5
 
6
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the Enron dataset and materialize its 'train' split as a DataFrame.
dataset = load_dataset("Hellisotherpeople/enron_emails_parsed")
enron_data = pd.DataFrame(dataset['train'])

# Load the model and tokenizer.
# The checkpoint id has the form "<owner>/<name>"; it must not carry any
# extra prefix, or from_pretrained() resolves a different repository.
model_name = "SamLowe/roberta-base-go_emotions"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Move the weights to the selected device once, up front, so every batch
# sent to the model only needs its inputs transferred.
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
17
 
18
  # Define the emotion labels (based on the GoEmotions dataset)
19
  emotion_labels = ["admiration", "amusement", "anger", "annoyance", "approval",
 
23
  "pride", "realization", "relief", "remorse", "sadness", "surprise",
24
  "neutral"]
25
 
26
def classify_emotions_in_batches(texts, batch_size=32):
    """Classify texts batch by batch and return one class id per text.

    Args:
        texts: Sequence of texts to classify. Entries that are not ``str``
            (for example ``None`` or a float NaN coming from a DataFrame
            column) are classified as the empty string instead of aborting
            the whole run inside the tokenizer.
        batch_size: Number of texts tokenized and sent through the model
            per forward pass. Must be at least 1.

    Returns:
        A list of integer class ids (argmax over the model's logits), in the
        same order as ``texts`` and with the same length.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(
            f"batch_size must be a positive integer, got {batch_size!r}"
        )

    # The tokenizer rejects non-string items, so normalize them up front;
    # this keeps the output aligned one-to-one with the input.
    cleaned = [text if isinstance(text, str) else "" for text in texts]

    results = []
    for start in range(0, len(cleaned), batch_size):
        batch = cleaned[start:start + batch_size]
        # Pad to the longest item in the batch and truncate over-long inputs
        # so the batch forms a single rectangular tensor on `device`.
        inputs = tokenizer(
            batch, return_tensors="pt", truncation=True, padding=True
        ).to(device)
        # Inference only: skip autograd bookkeeping to save memory.
        with torch.no_grad():
            logits = model(**inputs).logits
        results.extend(torch.argmax(logits, dim=-1).tolist())
    return results
38
+
39
# Apply emotion classification to the email content in batches
email_texts = enron_data['body'].tolist()
predicted_ids = classify_emotions_in_batches(email_texts, batch_size=32)

# Store the label name rather than the raw class id so the 'emotion' column
# can be read and filtered by emotion name.
# NOTE(review): assumes emotion_labels is ordered like the model's output
# indices - confirm against model.config.id2label.
enron_data['emotion'] = [emotion_labels[class_id] for class_id in predicted_ids]

# Save the results to a CSV file
enron_data.to_csv("enron_emails_with_emotions.csv", index=False)