david-thrower committed
Commit 4c1dfd9 · verified · Parent: db319a6

Update app.py


Try using TorchAO for quantization...

Files changed (1)
app.py +30 -10
app.py CHANGED
@@ -3,16 +3,16 @@ import gc
 
 import gradio as gr
 # import torch
-# from transformers import AutoTokenizer, AutoModelForCausalLM, HqqConfig
+# from transformers import AutoTokenizer, AutoModelForCausalLM #, HqqConfig
 
-# # quant_config = HqqConfig(nbits=8, group_size=64)
+# # # quant_config = HqqConfig(nbits=8, group_size=64)
 
 # MODEL_ID = "HuggingFaceTB/SmolLM3-3B"
 # DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
 # print("Loading tokenizer & model…")
 # tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-# # model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16).to(DEVICE)
+# # # model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16).to(DEVICE)
 
 # model =\
 #     AutoModelForCausalLM\
@@ -23,18 +23,38 @@ import gradio as gr
 # #     quantization_config=quant_config
 #     ).to(DEVICE)
 
-#gc.collect()
+# gc.collect()
 
 #########
 
-from unsloth import FastLanguageModel
-
-model, tokenizer = FastLanguageModel.from_pretrained(
-    "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
-    max_seq_length=128_000,
-    load_in_4bit=True
+import torch
+from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
+from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, Float8WeightOnlyConfig
+# quant_config = Float8WeightOnlyConfig()
+quant_config = Float8DynamicActivationFloat8WeightConfig()
+quantization_config = TorchAoConfig(quant_type=quant_config)
+
+MODEL_ID = "HuggingFaceTB/SmolLM3-3B"
+
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+quantized_model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID,
+    torch_dtype="auto",
+    device_map="auto",
+    quantization_config=quantization_config
 )
 
+
+#########
+
+# from unsloth import FastLanguageModel
+
+# model, tokenizer = FastLanguageModel.from_pretrained(
+#     "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
+#     max_seq_length=128_000,
+#     load_in_4bit=True
+# )
+
 #########
 
 # import gc
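
For reference, a minimal sketch of how the quantized model loaded above might be exercised. This is not part of the commit: the prompt, generation settings, and the memory-footprint check are illustrative assumptions layered on the standard transformers API.

# Illustrative sketch, not part of this commit. Assumes the tokenizer and
# quantized_model created in app.py above; the prompt and generation
# settings are placeholders.
messages = [{"role": "user", "content": "Say hello in one sentence."}]
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
).to(quantized_model.device)

with torch.no_grad():
    output_ids = quantized_model.generate(input_ids, max_new_tokens=64)

# Decode only the newly generated tokens.
print(tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True))

# Rough sanity check: float8 weights should land near half the bfloat16
# footprint for a 3B-parameter model.
print(f"Footprint: {quantized_model.get_memory_footprint() / 1e9:.2f} GB")

Note the design choice in the diff: Float8DynamicActivationFloat8WeightConfig quantizes activations as well as weights and generally needs fp8-capable hardware, while the commented-out Float8WeightOnlyConfig line is the more conservative fallback if the Space's GPU rejects the dynamic variant.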