shao3d commited on
Commit
216ac5c
·
verified ·
1 Parent(s): bc6163e

Update app.py

Browse files

Переключил модель на CPU без 8-bit

Files changed (1) hide show
  1. app.py +6 -6
app.py CHANGED
@@ -4,19 +4,19 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
4
  from peft import PeftModel
5
 
6
  base_model_name = "t-tech/T-lite-it-1.0"
7
- lora_repo = "shao3d/my-t-lite-qlora"
 
8
  tokenizer = AutoTokenizer.from_pretrained(base_model_name)
9
  base_model = AutoModelForCausalLM.from_pretrained(
10
  base_model_name,
11
- device_map="auto",
12
- load_in_8bit=True,
13
- torch_dtype=torch.float16
14
  )
15
  model = PeftModel.from_pretrained(base_model, lora_repo)
16
- model.eval()
17
 
18
  def generate_response(prompt):
19
- inputs = tokenizer(prompt, return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")
20
  outputs = model.generate(
21
  **inputs,
22
  max_new_tokens=200,
 
4
  from peft import PeftModel
5
 
6
  base_model_name = "t-tech/T-lite-it-1.0"
7
+ lora_repo = "shao3d/my-t-lite-qlora" # Replace with your login
8
+
9
  tokenizer = AutoTokenizer.from_pretrained(base_model_name)
10
  base_model = AutoModelForCausalLM.from_pretrained(
11
  base_model_name,
12
+ device_map="cpu", # Use CPU instead of "auto"
13
+ torch_dtype=torch.float16 # FP16 to save memory on CPU; NOTE(review): fp16 generation on CPU is often unsupported or slow in torch — confirm, float32 may be needed
 
14
  )
15
  model = PeftModel.from_pretrained(base_model, lora_repo)
16
+ model.eval() # Switch to inference mode
17
 
18
  def generate_response(prompt):
19
+ inputs = tokenizer(prompt, return_tensors="pt").to("cpu") # Используем CPU
20
  outputs = model.generate(
21
  **inputs,
22
  max_new_tokens=200,