## 导入库

In [1]:
import pandas as pd
from transformers import AutoTokenizer
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.preprocessing import LabelEncoder
import numpy as np

2024-06-11 01:22:27.464735: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-11 01:22:27.464834: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-11 01:22:27.562994: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## 数据预处理

In [2]:
df = pd.read_csv('/kaggle/input/emotion-analysis-based-on-text/emotion_sentimen_dataset.csv', encoding='utf-8')
df.head()

Unnamed: 0.1,Unnamed: 0,text,Emotion
0,0,i seriously hate one subject to death but now ...,hate
1,1,im so full of life i feel appalled,neutral
2,2,i sit here to write i start to dig out my feel...,neutral
3,3,ive been really angry with r and i feel like a...,anger
4,4,i feel suspicious if there is no one outside l...,neutral


In [3]:
# 删除不需要的列
df = df.drop(columns=['Unnamed: 0'])

In [4]:
dataset = Dataset.from_pandas(df)

train_test_split = dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

In [5]:
# 实例化标签编码器
label_encoder = LabelEncoder()

# 训练和编码标签
train_dataset = train_dataset.map(lambda examples: {'labels': label_encoder.fit_transform(examples['Emotion'])}, batched=True)
test_dataset = test_dataset.map(lambda examples: {'labels': label_encoder.fit_transform(examples['Emotion'])}, batched=True)

Map:   0%|          | 0/671644 [00:00<?, ? examples/s]

Map:   0%|          | 0/167911 [00:00<?, ? examples/s]

In [6]:
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [7]:
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/671644 [00:00<?, ? examples/s]

Map:   0%|          | 0/167911 [00:00<?, ? examples/s]

## 加载预训练模型

In [8]:
model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=13)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 评估函数

In [9]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {
        'accuracy': accuracy_score(labels, predictions),
        'f1': precision_recall_fscore_support(labels, predictions, average='weighted')[2]
    }

## 微调

In [10]:
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    report_to="none"  # 禁用wandb
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

In [11]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.2512,1.222117,0.692843,0.567131


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=83956, training_loss=1.0833058757935106, metrics={'train_runtime': 40999.4522, 'train_samples_per_second': 16.382, 'train_steps_per_second': 2.048, 'total_flos': 1.7673441503943475e+17, 'train_loss': 1.0833058757935106, 'epoch': 1.0})

## 保存

In [12]:
trainer.save_model("model")