gautamnancy committed
Commit 400c8de · verified · Parent: 7723e2c

Upload 7 files

README_paraphrase_detection.md ADDED
@@ -0,0 +1,124 @@
# Paraphrase Detection Pipeline using Transformers

This repository provides a complete pipeline to fine-tune a transformer model for **Paraphrase Detection** on the PAWS dataset.

---

## Steps

### 1. Load Dataset
Load the PAWS dataset, which contains pairs of sentences labeled to indicate whether they are paraphrases.

```python
from datasets import load_dataset

dataset = load_dataset("paws", "labeled_final")
```

### 2. Preprocess and Tokenize
Tokenize the sentence pairs with padding and truncation.

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/paraphrase-MiniLM-L6-v2")

def preprocess_function(examples):
    # Encode each pair as a single input: [CLS] sentence1 [SEP] sentence2 [SEP]
    return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, padding="max_length", max_length=128)

tokenized_datasets = dataset.map(preprocess_function, batched=True)
```

### 3. Load Model
Load the pre-trained encoder with a sequence-classification head for paraphrase detection. The two-label head is newly initialized and is trained during fine-tuning.

```python
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("sentence-transformers/paraphrase-MiniLM-L6-v2", num_labels=2)
```

### 4. Fine-tune the Model
Set up the training arguments and fine-tune the model with the Trainer API.

```python
from transformers import TrainingArguments, Trainer
import evaluate

training_args = TrainingArguments(
    output_dir="./paraphrase-detector",
    eval_strategy="epoch",  # named "evaluation_strategy" in older transformers releases
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_preds):
    # eval_preds is a (logits, labels) tuple of NumPy arrays
    logits, labels = eval_preds
    predictions = logits.argmax(axis=-1)
    return accuracy.compute(predictions=predictions, references=labels)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.save_model("paraphrase-detector")
```

### 5. Evaluate
Evaluate the fine-tuned model on the validation set.

```python
eval_results = trainer.evaluate()
print(eval_results)
```

### 6. Inference
Use the fine-tuned model for paraphrase detection.

```python
from transformers import pipeline

paraphrase_pipeline = pipeline("text-classification", model="paraphrase-detector", tokenizer=tokenizer)

# The text-classification pipeline accepts sentence pairs as {"text": ..., "text_pair": ...}
example = paraphrase_pipeline({
    "text": "How old are you?",
    "text_pair": "What is your age?"
})

print(example)  # e.g. [{'label': 'LABEL_1', 'score': ...}]; LABEL_1 means "paraphrase"
```

---

## Requirements
- `datasets`
- `transformers`
- `evaluate`
- `torch`
- `accelerate`
- `scikit-learn` (used by the `accuracy` metric)

Install dependencies with:

```bash
pip install datasets transformers evaluate torch accelerate scikit-learn
```

---

## Author
Your Name - [email protected]

---

## License
MIT License
config (1).json ADDED
@@ -0,0 +1,26 @@
{
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float16",
  "transformers_version": "4.51.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}
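This configuration describes the fine-tuned checkpoint: a 6-layer BERT encoder (MiniLM-sized, hidden size 384, 12 attention heads) with a single-label classification head, saved in float16. A minimal sketch of inspecting it, assuming the uploaded files are stored under their standard names (`config.json`, ...) at a placeholder path `paraphrase-detector`:

```python
# Illustrative sketch: inspect the uploaded model configuration.
# "paraphrase-detector" is a placeholder for the local directory or Hub repo id
# where config.json lives under its standard name.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("paraphrase-detector")
print(config.model_type, config.num_hidden_layers, config.hidden_size)  # bert 6 384
```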
model (2).safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:082b6a4554b030aa6f347938550c83b72a796483f2e9c2a68a3220dfa9eb25fd
size 45439980
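This Git LFS pointer references the fine-tuned weights (~45 MB in float16) that go with the config above. A minimal inference sketch, assuming the config, weights, and tokenizer files are available under their standard names (`config.json`, `model.safetensors`, `tokenizer.json`, ...) at the placeholder path `paraphrase-detector`:

```python
# Minimal sketch: run the fine-tuned classifier on one sentence pair.
# Assumes the uploaded files are available under their standard names;
# "paraphrase-detector" is a placeholder local path or Hub repo id.
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("paraphrase-detector")
model = AutoModelForSequenceClassification.from_pretrained("paraphrase-detector")
model.eval()

inputs = tokenizer("How old are you?", "What is your age?", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
probs = logits.softmax(dim=-1)  # index 1 = paraphrase probability (PAWS label 1)
print(probs)
```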
special_tokens_map (1).json ADDED
@@ -0,0 +1,37 @@
{
  "cls_token": {
    "content": "[CLS]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "mask_token": {
    "content": "[MASK]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "[PAD]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "sep_token": {
    "content": "[SEP]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "[UNK]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
tokenizer (1).json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config (1).json ADDED
@@ -0,0 +1,58 @@
{
  "added_tokens_decoder": {
    "0": {
      "content": "[PAD]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "100": {
      "content": "[UNK]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "101": {
      "content": "[CLS]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "102": {
      "content": "[SEP]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "103": {
      "content": "[MASK]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "clean_up_tokenization_spaces": false,
  "cls_token": "[CLS]",
  "do_basic_tokenize": true,
  "do_lower_case": true,
  "extra_special_tokens": {},
  "mask_token": "[MASK]",
  "model_max_length": 512,
  "never_split": null,
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "strip_accents": null,
  "tokenize_chinese_chars": true,
  "tokenizer_class": "BertTokenizer",
  "unk_token": "[UNK]"
}
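The tokenizer config registers the standard BERT special tokens at WordPiece ids 0 ([PAD]), 100 ([UNK]), 101 ([CLS]), 102 ([SEP]), and 103 ([MASK]), lowercases input, and caps sequences at 512 tokens. A quick illustrative check of how a sentence pair is encoded for the classifier, again using the placeholder path `paraphrase-detector`:

```python
# Illustrative: sentence pairs are encoded as [CLS] s1 [SEP] s2 [SEP],
# using the special-token ids registered above (CLS=101, SEP=102, PAD=0).
# "paraphrase-detector" is a placeholder path for the uploaded tokenizer files.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("paraphrase-detector")
enc = tokenizer("How old are you?", "What is your age?")

print(enc["input_ids"])       # begins with 101 ([CLS]); 102 ([SEP]) closes each segment
print(enc["token_type_ids"])  # 0 for the first sentence, 1 for the second
```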
vocab (1).txt ADDED
The diff for this file is too large to render. See raw diff