metascroy committed · commit 8ee4bb2 (verified) · 1 parent: a05b1b3

Update README.md

Files changed (1): README.md (+3, -3)

README.md CHANGED:
```diff
@@ -25,7 +25,7 @@ We provide the [quantized pte](https://huggingface.co/pytorch/Qwen3-4B-INT8-INT4
 (The provided pte file is exported with a max_seq_length/max_context_length of 1024; if you wish to change this, re-export the quantized model following the instructions in [Exporting to ExecuTorch](#exporting-to-executorch).)

 # Running in a mobile app
-The [pte file](https://huggingface.co/pytorch/Qwen3-4B-INT8-INT4/blob/main/qwen3-4B-INT8-INT4-1024-cxt.pte) can be run with ExecuTorch on a mobile phone. See the [instructions](https://pytorch.org/executorch/main/llm/llama-demo-ios.html) for doing this in iOS.
+The [pte file](https://huggingface.co/pytorch/Qwen3-4B-INT8-INT4/blob/main/model.pte) can be run with ExecuTorch on a mobile phone. See the [instructions](https://pytorch.org/executorch/main/llm/llama-demo-ios.html) for doing this in iOS.
 On iPhone 15 Pro, the model runs at 14.8 tokens/sec and uses 3379 Mb of memory.

 ![image/png](https://cdn-uploads.huggingface.co/production/uploads/66049fc71116cebd1d3bdcf4/eVHB7fVllmwVauKJvGu0d.png)
```
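The only substantive change in this hunk is the pte filename (qwen3-4B-INT8-INT4-1024-cxt.pte → model.pte). As a hedged aside, here is a minimal sketch of fetching the renamed artifact with `huggingface_hub`, assuming the file sits at the repo root as the new URL suggests; the repo id comes from the link in the hunk.

```python
# Sketch: download the renamed pte file from the Hub.
# Assumes the file now lives at model.pte in the repo root, per the + line above.
from huggingface_hub import hf_hub_download

pte_path = hf_hub_download(
    repo_id="pytorch/Qwen3-4B-INT8-INT4",
    filename="model.pte",  # was qwen3-4B-INT8-INT4-1024-cxt.pte before this commit
)
print(pte_path)  # local path to hand off to the ExecuTorch iOS demo app
```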
```diff
@@ -122,10 +122,10 @@ linear_config = Int8DynamicActivationIntxWeightConfig(
     weight_scale_dtype=torch.bfloat16,
 )
 quant_config = ModuleFqnToConfig({"_default": linear_config, "model.embed_tokens": embedding_config})
-quantization_config = TorchAoConfig(quant_type=quant_config, include_embedding=True, untie_embedding_weights=True, modules_to_not_convert=[])
+quantization_config = TorchAoConfig(quant_type=quant_config, include_input_output_embeddings=True, modules_to_not_convert=[])

 # either use `untied_model_id` or `untied_model_local_path`
-quantized_model = AutoModelForCausalLM.from_pretrained(untied_model_id, torch_dtype=torch.float32, device_map="auto", quantization_config=quantization_config)
+quantized_model = AutoModelForCausalLM.from_pretrained(untied_model_id, device_map="auto", torch_dtype=torch.bfloat16, quantization_config=quantization_config)
 tokenizer = AutoTokenizer.from_pretrained(model_id)

 # Push to hub
```
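For readability, here is a hedged reconstruction of the snippet these two changed lines live in, with the commit's new arguments in place. Only `weight_scale_dtype`, `quant_config`, `quantization_config`, and the `from_pretrained` call appear in the hunk; the `model_id`/`untied_model_id` values, `embedding_config`, and the remaining `linear_config` arguments are assumptions patterned on typical torchao model cards, not the README's actual text.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig
from torchao.quantization import (
    Int8DynamicActivationIntxWeightConfig,
    IntxWeightOnlyConfig,
    ModuleFqnToConfig,
)
from torchao.quantization.granularity import PerAxis, PerGroup

model_id = "Qwen/Qwen3-4B"  # assumed base model for the tokenizer
untied_model_id = "pytorch/Qwen3-4B-untied-weights"  # hypothetical; see the README's comment

# Assumed int8 weight-only config for the embedding table (not shown in the hunk)
embedding_config = IntxWeightOnlyConfig(
    weight_dtype=torch.int8,
    granularity=PerAxis(0),
)
# int8 dynamic activation / intx weight config for linear layers;
# only weight_scale_dtype is confirmed by the hunk
linear_config = Int8DynamicActivationIntxWeightConfig(
    weight_dtype=torch.int4,
    weight_granularity=PerGroup(32),
    weight_scale_dtype=torch.bfloat16,
)
quant_config = ModuleFqnToConfig({"_default": linear_config, "model.embed_tokens": embedding_config})

# include_input_output_embeddings=True replaces the removed
# include_embedding/untie_embedding_weights flags in this commit
quantization_config = TorchAoConfig(
    quant_type=quant_config,
    include_input_output_embeddings=True,
    modules_to_not_convert=[],
)

quantized_model = AutoModelForCausalLM.from_pretrained(
    untied_model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,  # changed from torch.float32 in this commit
    quantization_config=quantization_config,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
```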
 
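The hunk ends just before the push step. A minimal sketch of what typically follows, with a hypothetical target repo name; torchao-quantized checkpoints are commonly pushed with `safe_serialization=False` because the quantized tensor subclasses do not serialize to safetensors.

```python
# Hypothetical push step; USER_ID and the repo name are placeholders, not from the diff.
USER_ID = "YOUR_USER_ID"
save_to = f"{USER_ID}/Qwen3-4B-INT8-INT4"

quantized_model.push_to_hub(save_to, safe_serialization=False)
tokenizer.push_to_hub(save_to)
```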