danielle2003 committed
Commit 2b3de4f · 1 Parent(s): 81a820f

adding scripts'

Files changed (3)
  1. scripts/evaluate.py +24 -0
  2. scripts/test.py +16 -0
  3. scripts/train.py +43 -0
scripts/evaluate.py ADDED
@@ -0,0 +1,24 @@
+ from transformers import pipeline
+ from datasets import load_dataset
+ from sklearn.metrics import accuracy_score, f1_score
+
+ # Load the test split of the Allociné French sentiment dataset
+ dataset = load_dataset("allocine")["test"]
+
+ # Load the fine-tuned model from the local ./models directory
+ classifier = pipeline("text-classification", model="./models")
+
+ # Get predictions (truncate long reviews to the model's max length)
+ predictions = [classifier(example["review"], truncation=True)[0]["label"] for example in dataset]
+ labels = dataset["label"]
+
+ # Convert label names to integer ids (Allociné is binary: 0 = negative, 1 = positive)
+ label_map = {"LABEL_0": 0, "LABEL_1": 1}
+ predictions = [label_map[p] for p in predictions]
+
+ # Compute metrics
+ accuracy = accuracy_score(labels, predictions)
+ f1 = f1_score(labels, predictions, average="weighted")
+
+ print(f"Accuracy: {accuracy:.4f}")
+ print(f"F1-score: {f1:.4f}")
scripts/test.py ADDED
@@ -0,0 +1,16 @@
+ import unittest
+ from transformers import pipeline
+
+ classifier = pipeline("text-classification", model="./models")
+
+ class TestModel(unittest.TestCase):
+     def test_positive_sentiment(self):
+         result = classifier("I love this product!")[0]
+         self.assertIn(result["label"], ["LABEL_0", "LABEL_1"])
+
+     def test_negative_sentiment(self):
+         result = classifier("This is terrible, I hate it.")[0]
+         self.assertIn(result["label"], ["LABEL_0", "LABEL_1"])
+
+ if __name__ == "__main__":
+     unittest.main()
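
The two tests above only assert that the returned label is one of the expected names. If a stronger check on the pipeline output is wanted, a sketch along these lines could be added (the TestModelOutput class and the French example sentence are hypothetical; the same ./models checkpoint is assumed):

import unittest
from transformers import pipeline

classifier = pipeline("text-classification", model="./models")

class TestModelOutput(unittest.TestCase):
    def test_score_is_probability(self):
        # The pipeline returns a dict with "label" and "score" for each input
        result = classifier("Un film magnifique, je le recommande.")[0]
        self.assertGreaterEqual(result["score"], 0.0)
        self.assertLessEqual(result["score"], 1.0)

if __name__ == "__main__":
    unittest.main()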
scripts/train.py ADDED
@@ -0,0 +1,43 @@
+ from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer
+ from datasets import load_dataset
+
+ # Load dataset (French sentiment dataset: Allociné, binary labels)
+ dataset = load_dataset("allocine")
+
+ # Load tokenizer
+ model_name = "distilbert-base-multilingual-cased"
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+ # Tokenize data
+ def tokenize(batch):
+     return tokenizer(batch["review"], padding="max_length", truncation=True)
+
+ dataset = dataset.map(tokenize, batched=True)
+
+ # Load model (Allociné has two classes: negative/positive)
+ model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
+
+ # Training arguments; save and eval strategies must match for load_best_model_at_end
+ training_args = TrainingArguments(
+     output_dir="./models",
+     per_device_train_batch_size=8,
+     num_train_epochs=3,
+     evaluation_strategy="epoch",
+     save_strategy="epoch",
+     load_best_model_at_end=True,
+ )
+
+ # Trainer setup
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=dataset["train"],
+     eval_dataset=dataset["test"],
+ )
+
+ # Train model
+ trainer.train()
+
+ # Save model
+ model.save_pretrained("./models")
+ tokenizer.save_pretrained("./models")
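
With load_best_model_at_end=True and no metric_for_best_model set, the Trainer selects the checkpoint with the lowest evaluation loss. If accuracy and F1 (as computed in evaluate.py) should also be reported during evaluation, a compute_metrics hook roughly like the sketch below could be passed to the Trainer (this assumes scikit-learn is available; it is not part of this commit):

import numpy as np
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    # eval_pred.predictions are the raw logits; argmax gives the predicted class
    logits, labels = eval_pred.predictions, eval_pred.label_ids
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="weighted"),
    }

# trainer = Trainer(..., compute_metrics=compute_metrics)

Setting metric_for_best_model="f1" in TrainingArguments would then make checkpoint selection follow F1 instead of loss.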