ciyidogan committed on
Commit
cf581f9
·
verified ·
1 Parent(s): 3ed827f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -6
app.py CHANGED
@@ -8,6 +8,7 @@ from fastapi import FastAPI, HTTPException
8
  from pydantic import BaseModel
9
  from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
10
  from contextlib import asynccontextmanager
 
11
 
12
  # === Ortam değişkenleri
13
  os.environ.setdefault("HF_HOME", "/app/.cache")
@@ -22,7 +23,7 @@ def log(message):
22
  def trim_history(messages, max_blocks=20):
23
  return messages[-max_blocks:]
24
 
25
- def enforce_token_budget(tokenizer, system_prompt, history_messages, user_input, total_ctx=4096, max_new_tokens=200):
26
  system_tokens = len(tokenizer(system_prompt)['input_ids'])
27
  user_tokens = len(tokenizer(user_input)['input_ids'])
28
  history_tokens = sum(len(tokenizer(m['content'])['input_ids']) for m in history_messages)
@@ -68,8 +69,8 @@ async def lifespan(app: FastAPI):
68
  tokenizer.pad_token = tokenizer.eos_token
69
 
70
  quant_config = BitsAndBytesConfig(
71
- load_in_4bit=True, # ✅ 4-bit quantization
72
- llm_int4_threshold=6.0
73
  )
74
 
75
  model = AutoModelForCausalLM.from_pretrained(
@@ -78,7 +79,7 @@ async def lifespan(app: FastAPI):
78
  quantization_config=quant_config
79
  )
80
 
81
- log("✅ Model ve tokenizer başarıyla hazır (4-bit quantized, BitsAndBytesConfig).")
82
  yield
83
 
84
  except Exception as e:
@@ -101,7 +102,7 @@ def generate(req: UserInputRequest):
101
  log(f"💬 Kullanıcı isteği alındı: {req.user_input}")
102
 
103
  trimmed_history = trim_history(req.history, max_blocks=20)
104
- trimmed_history = enforce_token_budget(tokenizer, req.system_prompt, trimmed_history, req.user_input, total_ctx=4096, max_new_tokens=200)
105
 
106
  # === Apply chat template
107
  t0 = time.time()
@@ -134,7 +135,7 @@ def generate(req: UserInputRequest):
134
 
135
  input_len = input_ids.shape[-1]
136
  total_ctx = model.config.max_position_embeddings if hasattr(model.config, 'max_position_embeddings') else 4096
137
- max_new_tokens = min(200, max(1, total_ctx - input_len))
138
 
139
  log(f"ℹ️ Input uzunluğu: {input_len}, max_new_tokens ayarlandı: {max_new_tokens}")
140
 
 
8
  from pydantic import BaseModel
9
  from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
10
  from contextlib import asynccontextmanager
11
+ import torch
12
 
13
  # === Ortam değişkenleri
14
  os.environ.setdefault("HF_HOME", "/app/.cache")
 
23
  def trim_history(messages, max_blocks=20):
24
  return messages[-max_blocks:]
25
 
26
+ def enforce_token_budget(tokenizer, system_prompt, history_messages, user_input, total_ctx=4096, max_new_tokens=128):
27
  system_tokens = len(tokenizer(system_prompt)['input_ids'])
28
  user_tokens = len(tokenizer(user_input)['input_ids'])
29
  history_tokens = sum(len(tokenizer(m['content'])['input_ids']) for m in history_messages)
 
69
  tokenizer.pad_token = tokenizer.eos_token
70
 
71
  quant_config = BitsAndBytesConfig(
72
+ load_in_4bit=True,
73
+ bnb_4bit_compute_dtype=torch.float16 # ✅ float16 hızlandırma
74
  )
75
 
76
  model = AutoModelForCausalLM.from_pretrained(
 
79
  quantization_config=quant_config
80
  )
81
 
82
+ log("✅ Model ve tokenizer başarıyla hazır (4-bit quantized, float16 compute).")
83
  yield
84
 
85
  except Exception as e:
 
102
  log(f"💬 Kullanıcı isteği alındı: {req.user_input}")
103
 
104
  trimmed_history = trim_history(req.history, max_blocks=20)
105
+ trimmed_history = enforce_token_budget(tokenizer, req.system_prompt, trimmed_history, req.user_input, total_ctx=4096, max_new_tokens=128)
106
 
107
  # === Apply chat template
108
  t0 = time.time()
 
135
 
136
  input_len = input_ids.shape[-1]
137
  total_ctx = model.config.max_position_embeddings if hasattr(model.config, 'max_position_embeddings') else 4096
138
+ max_new_tokens = min(128, max(1, total_ctx - input_len))
139
 
140
  log(f"ℹ️ Input uzunluğu: {input_len}, max_new_tokens ayarlandı: {max_new_tokens}")
141