Upload 7 files
- README.md +116 -0
- config.json +27 -0
- model.onnx +3 -0
- special_tokens_map.json +37 -0
- tokenizer.json +0 -0
- tokenizer_config.json +65 -0
- vocab.txt +0 -0
README.md
ADDED
@@ -0,0 +1,116 @@
# Sentence-BERT Quantized Model for Text Similarity & Paraphrase Detection

This repository hosts a quantized version of the Sentence-BERT (SBERT) model, fine-tuned on the Quora Question Pairs dataset for text similarity and paraphrase detection. The model computes semantic similarity between two input sentences and has been optimized for efficient deployment using ONNX quantization.

## Model Details

- **Model Architecture:** Sentence-BERT (`all-MiniLM-L6-v2`)
- **Task:** Text Similarity & Paraphrase Detection
- **Dataset:** Quora Question Pairs (QQP)
- **Quantization:** ONNX (Dynamic Quantization)
- **Fine-tuning Framework:** Sentence-Transformers (Hugging Face)

## Usage

### Installation

```sh
pip install sentence-transformers onnxruntime transformers
```

### Loading the Model

#### Original Fine-tuned Model

```python
import numpy as np
from sentence_transformers import SentenceTransformer

# Load the fine-tuned model
model = SentenceTransformer("fine-tuned-model")

# Encode two sentences and compute cosine similarity
sentence1 = "How can I learn Python?"
sentence2 = "What is the best way to study Python?"

emb1 = model.encode(sentence1)
emb2 = model.encode(sentence2)

# Cosine similarity
score = np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2))
print("Similarity Score:", score)

# Threshold to classify as paraphrase
print("Paraphrase" if score > 0.75 else "Not Paraphrase")
```

#### Quantized ONNX Model

```python
import numpy as np
from onnxruntime import InferenceSession
from transformers import AutoTokenizer

# Load tokenizer and ONNX session
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
session = InferenceSession("sbert_onnx/model.onnx")

def encode_onnx(session, tokenizer, sentence):
    inputs = tokenizer(sentence, return_tensors="np", padding=True, truncation=True)
    outputs = session.run(None, dict(inputs))
    # Mean-pool the token embeddings into one sentence embedding. This assumes the
    # exported graph returns per-token hidden states (the usual Optimum export);
    # if it already returns a pooled embedding, use outputs[0][0] directly instead.
    token_embeddings = outputs[0][0]              # (seq_len, hidden_size)
    mask = inputs["attention_mask"][0][:, None]   # (seq_len, 1)
    return (token_embeddings * mask).sum(axis=0) / mask.sum()

# Encode and compute similarity (sentence1/sentence2 as defined above)
emb1 = encode_onnx(session, tokenizer, sentence1)
emb2 = encode_onnx(session, tokenizer, sentence2)
score = np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2))
print("Quantized Similarity Score:", score)
print("Paraphrase" if score > 0.75 else "Not Paraphrase")
```

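Mean pooling mirrors the pooling that `all-MiniLM-L6-v2` applies in Sentence-Transformers, so scores from the ONNX session should track the original fine-tuned model closely, up to quantization error.
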
## Performance Metrics

- **Accuracy:** ~0.87
- **F1 Score:** ~0.85
- **Threshold for classification:** 0.75 cosine similarity (see the reproduction sketch below)

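Evaluation and testing code lives in `test_functions.py`. For reference, here is a minimal sketch of how accuracy and F1 at the 0.75 threshold could be reproduced from cosine scores; the CSV path and the `question1`/`question2`/`is_duplicate` column names are assumptions based on the standard Kaggle QQP format, not taken from this repository.

```python
# Hypothetical evaluation sketch: compute accuracy/F1 at the 0.75 cosine threshold.
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics import accuracy_score, f1_score

model = SentenceTransformer("fine-tuned-model")
df = pd.read_csv("qqp_validation.csv")  # assumed held-out QQP split

# Unit-normalized embeddings make the dot product equal to cosine similarity
emb1 = model.encode(df["question1"].tolist(), normalize_embeddings=True)
emb2 = model.encode(df["question2"].tolist(), normalize_embeddings=True)
scores = (emb1 * emb2).sum(axis=1)
preds = (scores > 0.75).astype(int)

print("Accuracy:", accuracy_score(df["is_duplicate"], preds))
print("F1 Score:", f1_score(df["is_duplicate"], preds))
```
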
## Fine-Tuning Details

### Dataset

- **Source:** Quora Question Pairs (Kaggle)
- **Size:** 400K+ question pairs labeled as paraphrase or not

### Training Configuration

- **Epochs:** 3
- **Batch Size:** 16
- **Evaluation Steps:** 1000
- **Warmup Steps:** 1000
- **Loss Function:** CosineSimilarityLoss (see the sketch below)

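A minimal sketch of a training run matching the configuration above, using the Sentence-Transformers `fit` API; the in-line example data is illustrative rather than the exact preprocessing used for this model.

```python
# Sketch: fine-tune all-MiniLM-L6-v2 on QQP-style pairs with CosineSimilarityLoss.
# The hard-coded example pair stands in for the real ~400K-pair dataset.
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Each QQP row becomes an InputExample with a float label (1.0 = paraphrase, 0.0 = not)
train_examples = [
    InputExample(texts=["How can I learn Python?",
                        "What is the best way to study Python?"], label=1.0),
    # ... remaining question pairs
]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
train_loss = losses.CosineSimilarityLoss(model)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=3,
    warmup_steps=1000,
    evaluation_steps=1000,  # only takes effect when an evaluator is also passed
    output_path="fine-tuned-model",
)
```
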
### Quantization

- **Method:** ONNX dynamic quantization
- **Tool:** Hugging Face Optimum + ONNX Runtime (see the sketch below)

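A minimal sketch of how the export and dynamic quantization could be reproduced; the input and output paths follow the repository layout but are assumptions, and the exact commands used for this upload may differ.

```python
# Sketch: export the fine-tuned transformer to ONNX with Optimum, then apply
# dynamic (weight-only) INT8 quantization with ONNX Runtime. Paths are assumed.
from optimum.onnxruntime import ORTModelForFeatureExtraction
from onnxruntime.quantization import quantize_dynamic, QuantType

# Export the transformer backbone to ONNX (pooling is applied at inference time)
ort_model = ORTModelForFeatureExtraction.from_pretrained("fine-tuned-model", export=True)
ort_model.save_pretrained("sbert_onnx")

# Quantize the exported graph's weights to INT8
quantize_dynamic(
    model_input="sbert_onnx/model.onnx",
    model_output="sbert_onnx/model.quant.onnx",
    weight_type=QuantType.QInt8,
)
```
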
## Repository Structure

```
.
├── fine-tuned-model/    # Fine-tuned SBERT model directory
├── sbert_onnx/          # Quantized ONNX model directory
├── test_functions.py    # Code for evaluation and testing
└── README.md            # Project documentation
```

## Limitations

- The cosine similarity threshold (0.75) may need tuning for different domains.
- ONNX dynamic quantization may introduce a slight drop in accuracy compared to the full-precision model.
- The model outputs sentence embeddings and similarity scores, not classification logits, so paraphrase decisions depend entirely on the chosen threshold.

## Contributing

Contributions are welcome! Please open an issue or submit a pull request for bug fixes or improvements.

config.json
ADDED
@@ -0,0 +1,27 @@
{
  "_attn_implementation_autoset": true,
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "export_model_type": "transformer",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.51.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}
model.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:247b9e85a5ae0b43438349807d309ebb0d2644e51c2509367331effc3cf0028a
size 90447733
special_tokens_map.json
ADDED
@@ -0,0 +1,37 @@
{
  "cls_token": {
    "content": "[CLS]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "mask_token": {
    "content": "[MASK]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "[PAD]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "sep_token": {
    "content": "[SEP]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "[UNK]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
tokenizer_config.json
ADDED
@@ -0,0 +1,65 @@
{
  "added_tokens_decoder": {
    "0": {
      "content": "[PAD]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "100": {
      "content": "[UNK]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "101": {
      "content": "[CLS]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "102": {
      "content": "[SEP]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "103": {
      "content": "[MASK]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "clean_up_tokenization_spaces": false,
  "cls_token": "[CLS]",
  "do_basic_tokenize": true,
  "do_lower_case": true,
  "extra_special_tokens": {},
  "mask_token": "[MASK]",
  "max_length": 128,
  "model_max_length": 256,
  "never_split": null,
  "pad_to_multiple_of": null,
  "pad_token": "[PAD]",
  "pad_token_type_id": 0,
  "padding_side": "right",
  "sep_token": "[SEP]",
  "stride": 0,
  "strip_accents": null,
  "tokenize_chinese_chars": true,
  "tokenizer_class": "BertTokenizer",
  "truncation_side": "right",
  "truncation_strategy": "longest_first",
  "unk_token": "[UNK]"
}
vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff