Update README.md (#2)
- Update README.md (0d2b1b1d927f98d9fecbce2dd9df133927d053a0)
Co-authored-by: Scott Roy <[email protected]>
README.md CHANGED

````diff
@@ -10,33 +10,62 @@ tags:
 We used following code to get the quantized model:
 
 ```
-model_id = "microsoft/Phi-4-mini-instruct"
 from transformers import (
     AutoModelForCausalLM,
     AutoProcessor,
     AutoTokenizer,
+    TorchAoConfig,
 )
 from torchao.quantization.quant_api import (
     Int8DynamicActivationIntxWeightConfig,
-    MappingType,
-    quantize_,
 )
 from torchao.quantization.granularity import PerGroup
 import torch
 
-
-    model_id, torch_dtype="auto", device_map="auto"
-)
+model_id = "microsoft/Phi-4-mini-instruct"
 linear_config = Int8DynamicActivationIntxWeightConfig(
     weight_dtype=torch.int4,
     weight_granularity=PerGroup(32),
-    weight_mapping_type=MappingType.SYMMETRIC,
 )
-
-
-
+quantization_config = TorchAoConfig(quant_type=linear_config)
+quantized_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map="auto", quantization_config=quantization_config)
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+# Push to hub
+USER_ID = "YOUR_USER_ID"
+save_to = f"{USER_ID}/phi4-mini-8dq4w"
+quantized_model.push_to_hub(save_to, safe_serialization=False)
+tokenizer.push_to_hub(save_to)
+
+# Manual testing
+prompt = "Hey, are you conscious? Can you talk to me?"
+messages = [
+    {
+        "role": "system",
+        "content": "",
+    },
+    {"role": "user", "content": prompt},
+]
+templated_prompt = tokenizer.apply_chat_template(
+    messages,
+    tokenize=False,
+    add_generation_prompt=True,
 )
-
+print("Prompt:", prompt)
+print("Templated prompt:", templated_prompt)
+inputs = tokenizer(
+    templated_prompt,
+    return_tensors="pt",
+).to("cuda")
+generated_ids = quantized_model.generate(**inputs, max_new_tokens=128)
+output_text = tokenizer.batch_decode(
+    generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+)
+print("Response:", output_text[0][len(prompt):])
+
+
+# Save to disk
+state_dict = quantized_model.state_dict()
 torch.save(state_dict, "phi4-mini-8dq4w.pt")
 ```
 
````
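After the push step above, the quantized checkpoint can be loaded back from the Hub like any other transformers model. A minimal sketch, assuming torchao is installed in the serving environment and that `YOUR_USER_ID/phi4-mini-8dq4w` is the hypothetical repo id created by `push_to_hub`; because the checkpoint is saved with `safe_serialization=False`, the weights are stored as torchao tensor subclasses, so `torchao` must be importable when loading:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hypothetical repo id produced by the push_to_hub step above.
hub_id = "YOUR_USER_ID/phi4-mini-8dq4w"

# torchao-quantized weights are restored on load; torchao must be installed here.
model = AutoModelForCausalLM.from_pretrained(hub_id, torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(hub_id)

inputs = tokenizer("Hey, are you conscious? Can you talk to me?", return_tensors="pt").to(model.device)
generated_ids = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0])
```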
````diff
@@ -57,12 +86,7 @@ from lm_eval.utils import (
     make_table,
 )
 
-
-# quantize_(
-# model,
-# linear_config,
-#)
-lm_eval_model = lm_eval.models.huggingface.HFLM(pretrained=model, batch_size=8)
+lm_eval_model = lm_eval.models.huggingface.HFLM(pretrained=quantized_model, batch_size=8)
 results = evaluator.simple_evaluate(
     lm_eval_model, tasks=["hellaswag"], device="cuda:0", batch_size="auto"
 )
````
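The `make_table` helper imported above can summarize the run; a small sketch, assuming `results` is the dictionary returned by `evaluator.simple_evaluate`:

```python
# Pretty-print the evaluation summary produced by evaluator.simple_evaluate above.
print(make_table(results))

# Raw per-task metrics (e.g. hellaswag accuracy) live under the "results" key.
print(results["results"]["hellaswag"])
```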
````diff
@@ -86,9 +110,12 @@ Exporting to ExecuTorch requires you clone and install [ExecuTorch](https://gith
 
 
 ## Convert quantized checkpoint to ExecuTorch's format
+```
 python -m executorch.examples.models.phi_4_mini.convert_weights phi4-mini-8dq4w.pt phi4-mini-8dq4w-converted.pt
+```
 
 ## Export to an ExecuTorch *.pte with XNNPACK
+```
 PARAMS="executorch/examples/models/phi_4_mini/config.json"
 python -m executorch.examples.models.llama.export_llama \
   --model "phi_4_mini" \
````
````diff
@@ -99,11 +126,13 @@ python -m executorch.examples.models.llama.export_llama \
   -X \
   --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
   --output_name="phi4-mini-8dq4w.pte"
+```
 
 ## Run model with pybindings
+```
 export TOKENIZER="/path/to/tokenizer.json"
 export TOKENIZER_CONFIG="/path/to/tokenizer_config.json"
-export PROMPT="<|system|><|end|><|user|>
+export PROMPT="<|system|><|end|><|user|>Hey, are you conscious? Can you talk to me?<|end|><|assistant|>"
 python -m executorch.examples.models.llama.runner.native \
   --model phi_4_mini \
   --pte phi4-mini-8dq4w.pte \
````
````diff
@@ -113,4 +142,5 @@ python -m executorch.examples.models.llama.runner.native \
   --prompt "${PROMPT}" \
   --params "${PARAMS}" \
   --max_len 128 \
-  --temperature 0
+  --temperature 0
+```
````