Update README.md (#2)
- Update README.md (0d2b1b1d927f98d9fecbce2dd9df133927d053a0)
Co-authored-by: Scott Roy <[email protected]>
README.md CHANGED

````diff
@@ -10,33 +10,62 @@ tags:
 We used following code to get the quantized model:
 
 ```
-model_id = "microsoft/Phi-4-mini-instruct"
 from transformers import (
     AutoModelForCausalLM,
     AutoProcessor,
     AutoTokenizer,
+    TorchAoConfig,
 )
 from torchao.quantization.quant_api import (
     Int8DynamicActivationIntxWeightConfig,
-    MappingType,
-    quantize_,
 )
 from torchao.quantization.granularity import PerGroup
 import torch
 
-
-    model_id, torch_dtype="auto", device_map="auto"
-)
+model_id = "microsoft/Phi-4-mini-instruct"
 linear_config = Int8DynamicActivationIntxWeightConfig(
     weight_dtype=torch.int4,
     weight_granularity=PerGroup(32),
-    weight_mapping_type=MappingType.SYMMETRIC,
 )
-
-
-
+quantization_config = TorchAoConfig(quant_type=linear_config)
+quantized_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map="auto", quantization_config=quantization_config)
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+# Push to hub
+USER_ID = "YOUR_USER_ID"
+save_to = f"{USER_ID}/phi4-mini-8dq4w"
+quantized_model.push_to_hub(save_to, safe_serialization=False)
+tokenizer.push_to_hub(save_to)
+
+# Manual testing
+prompt = "Hey, are you conscious? Can you talk to me?"
+messages = [
+    {
+        "role": "system",
+        "content": "",
+    },
+    {"role": "user", "content": prompt},
+]
+templated_prompt = tokenizer.apply_chat_template(
+    messages,
+    tokenize=False,
+    add_generation_prompt=True,
 )
-
+print("Prompt:", prompt)
+print("Templated prompt:", templated_prompt)
+inputs = tokenizer(
+    templated_prompt,
+    return_tensors="pt",
+).to("cuda")
+generated_ids = quantized_model.generate(**inputs, max_new_tokens=128)
+output_text = tokenizer.batch_decode(
+    generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+)
+print("Response:", output_text[0][len(prompt):])
+
+
+# Save to disk
+state_dict = quantized_model.state_dict()
 torch.save(state_dict, "phi4-mini-8dq4w.pt")
 ```
 
````
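After the push step above, the quantized checkpoint can be loaded back from the Hub like any other transformers model. A minimal sketch, assuming torchao is installed in the serving environment and that `YOUR_USER_ID/phi4-mini-8dq4w` is the hypothetical repo id created by `push_to_hub`; because the checkpoint is saved with `safe_serialization=False`, the weights are stored as torchao tensor subclasses, so `torchao` must be importable when loading:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hypothetical repo id produced by the push_to_hub step above.
hub_id = "YOUR_USER_ID/phi4-mini-8dq4w"

# torchao-quantized weights are restored on load; torchao must be installed here.
model = AutoModelForCausalLM.from_pretrained(hub_id, torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(hub_id)

inputs = tokenizer("Hey, are you conscious? Can you talk to me?", return_tensors="pt").to(model.device)
generated_ids = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0])
```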
````diff
@@ -57,12 +86,7 @@ from lm_eval.utils import (
     make_table,
 )
 
-
-# quantize_(
-# model,
-# linear_config,
-#)
-lm_eval_model = lm_eval.models.huggingface.HFLM(pretrained=model, batch_size=8)
+lm_eval_model = lm_eval.models.huggingface.HFLM(pretrained=quantized_model, batch_size=8)
 results = evaluator.simple_evaluate(
     lm_eval_model, tasks=["hellaswag"], device="cuda:0", batch_size="auto"
 )
````
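The `make_table` helper imported above can summarize the run; a small sketch, assuming `results` is the dictionary returned by `evaluator.simple_evaluate`:

```python
# Pretty-print the evaluation summary produced by evaluator.simple_evaluate above.
print(make_table(results))

# Raw per-task metrics (e.g. hellaswag accuracy) live under the "results" key.
print(results["results"]["hellaswag"])
```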
````diff
@@ -86,9 +110,12 @@ Exporting to ExecuTorch requires you clone and install [ExecuTorch](https://gith
 
 
 ## Convert quantized checkpoint to ExecuTorch's format
+```
 python -m executorch.examples.models.phi_4_mini.convert_weights phi4-mini-8dq4w.pt phi4-mini-8dq4w-converted.pt
+```
 
 ## Export to an ExecuTorch *.pte with XNNPACK
+```
 PARAMS="executorch/examples/models/phi_4_mini/config.json"
 python -m executorch.examples.models.llama.export_llama \
   --model "phi_4_mini" \
````
````diff
@@ -99,11 +126,13 @@ python -m executorch.examples.models.llama.export_llama \
   -X \
   --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
   --output_name="phi4-mini-8dq4w.pte"
+```
 
 ## Run model with pybindings
+```
 export TOKENIZER="/path/to/tokenizer.json"
 export TOKENIZER_CONFIG="/path/to/tokenizer_config.json"
-export PROMPT="<|system|><|end|><|user|>
+export PROMPT="<|system|><|end|><|user|>Hey, are you conscious? Can you talk to me?<|end|><|assistant|>"
 python -m executorch.examples.models.llama.runner.native \
   --model phi_4_mini \
   --pte phi4-mini-8dq4w.pte \
````
````diff
@@ -113,4 +142,5 @@ python -m executorch.examples.models.llama.runner.native \
   --prompt "${PROMPT}" \
   --params "${PARAMS}" \
   --max_len 128 \
-  --temperature 0
+  --temperature 0
+```
````