vLLM also supports OpenAI-compatible serving. See the [documentation](https://docs.vllm.ai/en/latest/) for more details.
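
As a quick illustration, the sketch below queries such a server with the official `openai` Python client. This is not part of the original card: it assumes you have already started a server locally with `vllm serve` on the default port 8000, and `<this-model-id>` is a placeholder for this checkpoint's Hugging Face ID.

```python
# Hedged sketch: assumes a vLLM OpenAI-compatible server is already running,
# e.g. started with `vllm serve <this-model-id>` (default port 8000).
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/v1",  # vLLM's OpenAI-compatible endpoint
    api_key="EMPTY",  # vLLM does not require a real key by default
)

response = client.chat.completions.create(
    model="<this-model-id>",  # placeholder: the name the model was served under
    messages=[{"role": "user", "content": "Give me a one-sentence summary of GPTQ."}],
    max_tokens=128,
)
print(response.choices[0].message.content)
```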

## Creation

<details>
<summary>Creation details</summary>

This model was created with [llm-compressor](https://github.com/vllm-project/llm-compressor) by running the code snippet below.

```python
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.transformers import oneshot

# Load model
model_stub = "microsoft/phi-4"
model_name = model_stub.split("/")[-1]

num_samples = 1024
max_seq_len = 8192

tokenizer = AutoTokenizer.from_pretrained(model_stub)

model = AutoModelForCausalLM.from_pretrained(
    model_stub,
    device_map="auto",
    torch_dtype="auto",
)

# Render calibration samples into plain text with the chat template
def preprocess_fn(example):
    return {"text": tokenizer.apply_chat_template(example["messages"], add_generation_prompt=False, tokenize=False)}

ds = load_dataset("neuralmagic/LLM_compression_calibration", split="train")
ds = ds.map(preprocess_fn)

# Configure the quantization algorithm and scheme
recipe = GPTQModifier(
    targets="Linear",
    scheme="W4A16",
    ignore=["lm_head"],
    dampening_frac=0.01,
)

# Apply quantization
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=max_seq_len,
    num_calibration_samples=num_samples,
)

# Save to disk in compressed-tensors format
save_path = model_name + "-quantized.w4a16"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
print(f"Model and tokenizer saved to: {save_path}")
```
</details>
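
As a sanity check (not part of the original recipe), the saved compressed-tensors checkpoint should load directly in vLLM. A minimal sketch, assuming vLLM is installed and the output directory matches the `save_path` written above:

```python
# Minimal sketch: load the quantized checkpoint saved by the snippet above.
# Assumes the directory name produced there ("phi-4-quantized.w4a16").
from vllm import LLM, SamplingParams

llm = LLM(model="phi-4-quantized.w4a16")  # compressed-tensors format is auto-detected
outputs = llm.generate(["What is quantization?"], SamplingParams(max_tokens=64))
print(outputs[0].outputs[0].text)
```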

## Evaluation