DesiredName committed
Commit 6d3fbf5 · verified · 1 Parent(s): 383a904

Update app.py

Files changed (1)
app.py +16 -7
app.py CHANGED
@@ -1,17 +1,26 @@
 from fastapi import FastAPI
 import uvicorn
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 
 model_name = "TheBloke/Wizard-Vicuna-13B-Uncensored-HF"
 
+# Configure 4-bit quantization
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,                 # Enable 4-bit quantization
+    bnb_4bit_quant_type="nf4",         # Use 4-bit NormalFloat (optimal)
+    bnb_4bit_compute_dtype="float16",  # Faster computation with float16
+    bnb_4bit_use_double_quant=True     # Extra compression
+)
+
+# Load model with quantization
 model = AutoModelForCausalLM.from_pretrained(
-    model_name,
-    device_map="auto",                 # Auto-distribute layers across CPU/GPU
-    low_cpu_mem_usage=True,            # Reduces CPU RAM during loading
-    torch_dtype="auto",                # Automatically select dtype (float16/32)
-    offload_folder="offload",
-    trust_remote_code=True
+    model_name,                        # Example model
+    quantization_config=bnb_config,
+    device_map="auto",                 # Auto-distribute across GPU/CPU
+    trust_remote_code=True             # Required for some models
 )
+
+# Load tokenizer
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 tokenizer.pad_token = tokenizer.eos_token
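
Note: the diff only covers model loading; the FastAPI routes in app.py are not shown. For scale, switching a 13B-parameter model from float16 weights (~26 GB) to NF4 4-bit weights brings the footprint down to roughly 6.5-7 GB plus overhead, while bnb_4bit_compute_dtype="float16" keeps matrix multiplications in fp16 for speed. Below is a minimal sketch of how the model and tokenizer loaded above could be served; the /generate endpoint, GenerateRequest schema, default max_new_tokens, and port 7860 are illustrative assumptions, not part of this commit.

# Sketch only (not part of this commit): serving the quantized model via FastAPI.
# Assumes the loading code above has already defined `model`, `tokenizer`,
# and imported uvicorn.
import torch
from pydantic import BaseModel

app = FastAPI()

class GenerateRequest(BaseModel):
    prompt: str
    max_new_tokens: int = 128          # assumed default, not from the commit

@app.post("/generate")                 # hypothetical endpoint name
def generate(req: GenerateRequest):
    # Tokenize and move inputs to the same device as the model's first layer
    inputs = tokenizer(req.prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():              # no gradients needed for inference
        output_ids = model.generate(
            **inputs,
            max_new_tokens=req.max_new_tokens,
            pad_token_id=tokenizer.eos_token_id,   # pad was set to EOS above
        )
    text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return {"generated_text": text}

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)    # 7860: typical HF Spaces port (assumption)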