## 导入库

In [1]:
import re
import nltk
from nltk.corpus.reader.tagged import ToktokTokenizer
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression

from scipy.stats import chi2_contingency

import warnings
from sklearn.exceptions import UndefinedMetricWarning
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

## 读取数据

In [2]:
# Read the emotion dataset from the Kaggle input directory and preview its first rows.
df = pd.read_csv(
    '/kaggle/input/emotion-analysis-based-on-text/emotion_sentimen_dataset.csv',
    encoding='utf-8',
)
df.head()

Unnamed: 0.1,Unnamed: 0,text,Emotion
0,0,i seriously hate one subject to death but now ...,hate
1,1,im so full of life i feel appalled,neutral
2,2,i sit here to write i start to dig out my feel...,neutral
3,3,ive been really angry with r and i feel like a...,anger
4,4,i feel suspicious if there is no one outside l...,neutral


## 对情绪标签编码

In [3]:
# Learn the label <-> integer mapping from the Emotion column, then encode every row.
label_encoder = LabelEncoder()
label_encoder.fit(df['Emotion'])

emotion_encoded = label_encoder.transform(df['Emotion'])
emotion_encoded.shape

(839555,)

## 预测

In [4]:
# 该词汇表由Sima Anjali提供
# Keyword table provided by Sima Anjali: emotion label -> three indicator words.
# Insertion order matters: predict_emotion walks the classes in this order and
# returns the first class that has a keyword contained in the comment.
top3_words_per_class = {
    'empty': ['empty', 'void', 'hollow'],
    'sadness': ['sad', 'melancholy', 'depressed'],
    'enthusiasm': ['enthusiastic', 'excited', 'eager'],
    'neutral': ['neutral', 'indifferent', 'unbiased'],
    'worry': ['worry', 'anxiety', 'concern'],
    'surprise': ['surprise', 'astonishment', 'shock'],
    'love': ['love', 'affection', 'adoration'],
    'fun': ['fun', 'joyful', 'amusing'],
    'hate': ['hate', 'detest', 'loathe'],
    'happiness': ['happy', 'joy', 'content'],
    'boredom': ['boredom', 'tedium', 'monotony'],
    'relief': ['relief', 'ease', 'comfort'],
    'anger': ['angry', 'rage', 'outrage']
}

def predict_emotion(comment, keywords_by_class=None, default="neutral"):
    """Predict an emotion label for ``comment`` by keyword lookup.

    The classes are scanned in the mapping's iteration order and the first
    class with at least one keyword occurring in the lower-cased comment is
    returned. Matching is by substring, not whole word, so 'sad' also
    matches 'sadness'.

    Args:
        comment: Text to classify. Non-string values (for example a missing
            value) yield ``default`` instead of raising.
        keywords_by_class: Mapping of emotion label to an iterable of
            lower-case keywords. When omitted, the notebook-level
            ``top3_words_per_class`` is used.
        default: Label returned when no keyword matches.

    Returns:
        The matched emotion label, or ``default``.
    """
    if keywords_by_class is None:
        # Resolved at call time so that re-binding the notebook-level table
        # in a later cell changes the predictions, exactly as before.
        keywords_by_class = top3_words_per_class
    if not isinstance(comment, str):
        return default

    # Lower-case once instead of once per keyword.
    lowered = comment.lower()
    for emotion, keywords in keywords_by_class.items():
        if any(keyword in lowered for keyword in keywords):
            return emotion
    return default

In [5]:
# 确定top3_words_per_class没有运用最后20%的数据
split = int(len(df['text']) * 0.8)
test_texts = df['text'][split:]
test_labels = df['Emotion'][split:]

# 预测
predicted_emotions = test_texts.apply(lambda x: predict_emotion(x))
test_labels_encoded = label_encoder.transform(test_labels)
predicted_emotions_encoded = label_encoder.transform(predicted_emotions)
accuracy = accuracy_score(test_labels_encoded, predicted_emotions_encoded)
report = classification_report(test_labels_encoded, predicted_emotions_encoded, target_names=label_encoder.classes_)

print('Accuracy:', accuracy)
print('Classification Report:')
print(report)

Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

       anger       1.00      1.00      1.00      2461
     boredom       1.00      1.00      1.00        30
       empty       1.00      1.00      1.00      1153
  enthusiasm       1.00      1.00      1.00      1877
         fun       1.00      1.00      1.00      2055
   happiness       1.00      1.00      1.00      5364
        hate       1.00      1.00      1.00      3012
        love       1.00      1.00      1.00      8045
     neutral       1.00      1.00      1.00    134788
      relief       1.00      1.00      1.00      3373
     sadness       1.00      1.00      1.00      3535
    surprise       1.00      1.00      1.00      1356
       worry       1.00      1.00      1.00       862

    accuracy                           1.00    167911
   macro avg       1.00      1.00      1.00    167911
weighted avg       1.00      1.00      1.00    167911



In [6]:
# 自己计算得到的top3_words_per_class
top3_words_per_class = {
       'anger': ['word', 'strong', 'anger'],
       'boredom': ['work', 'tortured', 'read'],
       'empty': ['want', 'try', 'trying'],
       'enthusiasm': ['eager', 'loved', 'happy'],
       'fun': ['doomed', 'brain', 'normal'],
       'happiness': ['cool', 'content', 'contented'],
       'hate': ['towards', 'hate', 'today'],
       'love': ['love', 'toward', 'hated'],
       'neutral': ['invigorated', 'valuable', 'shaky'],
       'relief': ['comfort', 'uncomfortable', 'pain'],
       'sadness': ['worry', 'excited', 'sadness'],
       'surprise': ['unpleasant', 'joyful', 'shocked'],
       'worry': ['worried', 'issues', 'questions']
}

In [7]:
# 预测
predicted_emotions = test_texts.apply(lambda x: predict_emotion(x))
test_labels_encoded = label_encoder.transform(test_labels)
predicted_emotions_encoded = label_encoder.transform(predicted_emotions)
accuracy = accuracy_score(test_labels_encoded, predicted_emotions_encoded)
report = classification_report(test_labels_encoded, predicted_emotions_encoded, target_names=label_encoder.classes_)

print('Accuracy:', accuracy)
print('Classification Report:')
print(report)

Accuracy: 0.7027532442782188
Classification Report:
              precision    recall  f1-score   support

       anger       0.02      0.04      0.03      2461
     boredom       0.00      0.13      0.00        30
       empty       0.01      0.12      0.02      1153
  enthusiasm       0.10      0.29      0.15      1877
         fun       0.02      0.01      0.02      2055
   happiness       0.59      0.22      0.32      5364
        hate       0.40      0.79      0.53      3012
        love       0.91      0.49      0.64      8045
     neutral       0.92      0.80      0.85    134788
      relief       0.55      0.47      0.51      3373
     sadness       0.16      0.06      0.09      3535
    surprise       0.39      0.34      0.36      1356
       worry       0.02      0.02      0.02       862

    accuracy                           0.70    167911
   macro avg       0.31      0.29      0.27    167911
weighted avg       0.83      0.70      0.75    167911



Sima Anjali 的词汇表很容易通过观察数据得到，确实非常巧妙（原文[链接](https://www.kaggle.com/code/simaanjali/sentiment-analysis)）。但是我希望用一种更通用的方式获得 top3_words_per_class，下面是我的一些尝试，最后翻车了……

## 数据预处理

### 去掉'Unnamed: 0'列

In [8]:
# Drop the 'Unnamed: 0' column; errors='ignore' keeps the cell re-runnable once
# the column is already gone, and no frame is mutated in place.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
# One .loc call with (row, column) instead of chained indexing.
df.loc[0, 'text']

'i seriously hate one subject to death but now i feel reluctant to drop it'

### 检查缺失值

In [9]:
# Report, per column, whether any value is missing.
df.isnull().any()

text       False
Emotion    False
dtype: bool

没有缺失值

### 统计标签个数

In [10]:
# Count the rows carrying each emotion label.
df['Emotion'].value_counts()

Emotion
neutral       674538
love           39553
happiness      27175
sadness        17481
relief         16729
hate           15267
anger          12336
fun            10075
enthusiasm      9304
surprise        6954
empty           5542
worry           4475
boredom          126
Name: count, dtype: int64

neutral情绪居多

### 过滤HTML

In [11]:
def noiseremovel_text(text):
    """Return ``text`` with markup and square-bracketed spans removed.

    The input is parsed with BeautifulSoup's "html.parser" and reduced to its
    text content; afterwards every ``[...]`` span (brackets included) is
    deleted.
    """
    plain = BeautifulSoup(text, "html.parser").get_text()
    return re.sub(r'\[[^]]*\]', '', plain)

In [12]:
# Try noiseremovel_text on a string that contains an HTML tag and bracketed spans.
sample_text = '<div>I really enjoyed the latest episode of my favorite show! [https://t.co/xyz123] Check out this link for a recap. #bestshowever [Ad: Stream now on MyStreamingService for 50% off!]</div>'
trans_sample_text = noiseremovel_text(sample_text)
trans_sample_text

'I really enjoyed the latest episode of my favorite show!  Check out this link for a recap. #bestshowever '

In [13]:
# Clean every text in the column element by element (overwrites df['text']).
df['text'] = df['text'].map(noiseremovel_text)

### 移除stopwords

In [14]:
# Download the NLTK 'stopwords' package and load the English list.
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
# Set form of the list, used for membership tests.
stop_wr = set(stopwords)

def remove_stopwords(text, stop_words):
    """Remove stop words from ``text`` and return the remaining tokens joined by spaces.

    Tokens are runs of word characters or runs of two or more dots; all other
    punctuation, including a single '.', is discarded by the tokenisation.
    A token is dropped when its lower-cased form is in ``stop_words``, so the
    stop words are expected in lower case.

    Args:
        text: The string to filter.
        stop_words: Iterable of stop words. A ``set``/``frozenset`` is used
            as-is; anything else is converted to a set once per call.

    Returns:
        The filtered text, with kept tokens in their original casing.
    """
    # Skip the copy when the caller already passes a set (the per-row case).
    if not isinstance(stop_words, (set, frozenset)):
        stop_words = set(stop_words)
    # Words, plus ellipses of two or more dots.
    tokens = re.findall(r'\w+|\.\.+', text)
    return ' '.join(token for token in tokens if token.lower() not in stop_words)

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
# Try remove_stopwords on one sentence, using the English stopword set.
sample_text = 'i seriously hate one subject to death but now i feel reluctant to drop it'
trans_sample_text = remove_stopwords(sample_text, stop_wr)
trans_sample_text

'seriously hate one subject death feel reluctant drop'

In [16]:
# Strip stop words from every text in the column (overwrites df['text']).
df['text'] = df['text'].apply(lambda text: remove_stopwords(text, stop_wr))

## 获取每个类别的最重要的关键词top10

In [17]:
#使用TF-IDF进行文本向量化
vectorizer = TfidfVectorizer(max_features=1000)
X_tfidf = vectorizer.fit_transform(df['text'])

X_train, X_test, y_train, y_test = train_test_split(X_tfidf, emotion_encoded, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [18]:
y_pred = model.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))

# Report with the emotion names rather than bare integer codes, consistent with
# the keyword-baseline reports. Passing `labels` pins the row order to the
# encoder's classes even if a class is absent from both y_test and y_pred.
print("Classification Report:")
print(classification_report(
    y_test,
    y_pred,
    labels=np.arange(len(label_encoder.classes_)),
    target_names=label_encoder.classes_,
))

Accuracy: 0.9709548510818231
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.64      0.77      2489
           1       0.00      0.00      0.00        21
           2       1.00      0.62      0.76      1096
           3       1.00      0.96      0.98      1839
           4       0.98      0.84      0.91      1977
           5       0.98      0.89      0.94      5370
           6       0.98      0.90      0.94      3018
           7       0.99      0.93      0.96      8001
           8       0.97      1.00      0.98    134999
           9       0.98      0.73      0.84      3396
          10       1.00      0.92      0.96      3428
          11       0.99      0.86      0.92      1372
          12       0.99      0.57      0.73       905

    accuracy                           0.97    167911
   macro avg       0.91      0.76      0.82    167911
weighted avg       0.97      0.97      0.97    167911



In [19]:
# 获取每个类别的最重要的关键词
feature_names = vectorizer.get_feature_names_out()
for i, class_label in enumerate(label_encoder.classes_):
    tops = model.coef_[i].argsort()[-10:][::-1]
    print(f"{class_label}: {', '.join([feature_names[idx] for idx in tops])}")

anger: angry, discouraged, feel, gives, upon, happening, support, anger, strong, word
boredom: tortured, dull, dissatisfied, productive, ignored, uncertain, read, work, anything, blank
empty: empty, sad, sadness, love, try, trying, want, aching, tend, depressed
enthusiasm: excited, eager, love, loved, happy, passionate, peaceful, surprised, appreciative, hopeful
fun: funny, fun, joyful, doomed, joy, brain, opportunity, happy, normal, money
happiness: happy, enjoy, unhappy, content, discontent, joy, contented, cool, useful, piece
hate: hate, whatever, hated, towards, comments, angry, feel, today, person, toward
love: love, loved, beloved, lovely, unloved, toward, towards, hate, hated, happy
neutral: adventurous, blessed, bitchy, restless, rude, valuable, overwhelmed, shaky, invigorated, horny
relief: comfortable, uncomfortable, pleased, please, comfort, sweet, sense, pain, relaxed, restless
sadness: sad, depressed, melancholy, sadness, love, shocked, excited, deeply, worry, feel
surpris

## 运用卡方检验筛选top3

In [20]:
# 获取每个类别的最重要的关键词
feature_names = vectorizer.get_feature_names_out()
top_words = {}
for i, class_label in enumerate(label_encoder.classes_):
    tops = model.coef_[i].argsort()[-10:][::-1]
    top_words[class_label] = [feature_names[j] for j in tops]

In [21]:
# 合并所有情绪类别的top10词汇
all_top_words = set(word for words in top_words.values() for word in words)

split = int(len(df['text'])*0.8)

# 使用这些词作为特征进行向量化
vectorizer = CountVectorizer(vocabulary=all_top_words)
X = vectorizer.fit_transform(df['text'][:split])
features = vectorizer.get_feature_names_out()

# 创建频率矩阵
word_counts = np.zeros((len(label_encoder.classes_), len(features)))
for i, text in enumerate(X):
    emotion_index = y_train[i] if i < len(y_train) else y_test[i - len(y_train)]
    word_counts[emotion_index] += text.toarray()[0]

In [22]:
# 进行卡方检验
chi2_results = []
for i, word in enumerate(features):
    # 构建一个2x2的表格来计算每个词的卡方值
    contingency_table = np.array([word_counts[:, i], np.sum(word_counts, axis=1) - word_counts[:, i]]).T
    chi2, p, dof, ex = chi2_contingency(contingency_table)
    chi2_results.append((word, chi2, p))


In [23]:
# 输出结果
chi2_df = pd.DataFrame(chi2_results, columns=['Word', 'Chi2 Statistic', 'p-value'])
print(chi2_df.sort_values(by='p-value'))

           Word  Chi2 Statistic   p-value
26        eager       25.034678  0.014659
18         cool       24.842598  0.015587
24       doomed       24.435655  0.017736
11        brain       23.400643  0.024511
43  invigorated       22.573347  0.031574
..          ...             ...       ...
9         blank        4.887556  0.961629
61        piece        4.496386  0.972726
80        sweet        4.249451  0.978503
35        gives        3.867413  0.985712
1   adventurous        2.220789  0.998985

[102 rows x 3 columns]


In [24]:
# 删除重复关键词并获取每个标签的top3关键词
selected_words = set()
top3_words_per_class = {}
for class_label in label_encoder.classes_:
    class_df = chi2_df[chi2_df['Word'].isin(top_words[class_label])]
    class_df = class_df.sort_values(by='p-value')
    top3_words = []
    for word in class_df['Word']:
        if word not in selected_words:
            top3_words.append(word)
            selected_words.add(word)
        if len(top3_words) == 3:
            break
    top3_words_per_class[class_label] = top3_words


In [25]:
# 输出结果
for class_label, words in top3_words_per_class.items():
    print(f"{class_label}: {', '.join(words)}")

anger: word, strong, anger
boredom: work, tortured, read
empty: want, try, trying
enthusiasm: eager, loved, happy
fun: doomed, brain, normal
happiness: cool, content, contented
hate: towards, hate, today
love: love, toward, hated
neutral: invigorated, valuable, shaky
relief: comfort, uncomfortable, pain
sadness: worry, excited, sadness
surprise: unpleasant, joyful, shocked
worry: worried, issues, questions


In [26]:
# Display the resulting dict as a literal (the same table appears hard-coded in an earlier cell).
top3_words_per_class

{'anger': ['word', 'strong', 'anger'],
 'boredom': ['work', 'tortured', 'read'],
 'empty': ['want', 'try', 'trying'],
 'enthusiasm': ['eager', 'loved', 'happy'],
 'fun': ['doomed', 'brain', 'normal'],
 'happiness': ['cool', 'content', 'contented'],
 'hate': ['towards', 'hate', 'today'],
 'love': ['love', 'toward', 'hated'],
 'neutral': ['invigorated', 'valuable', 'shaky'],
 'relief': ['comfort', 'uncomfortable', 'pain'],
 'sadness': ['worry', 'excited', 'sadness'],
 'surprise': ['unpleasant', 'joyful', 'shocked'],
 'worry': ['worried', 'issues', 'questions']}