ciyidogan committed
Commit 8a39f61 · verified · Parent: e68dc63

Update app.py

Files changed (1)
  1. app.py +57 -7
app.py CHANGED
@@ -18,6 +18,48 @@ def log(message):
     timestamp = datetime.now().strftime("%H:%M:%S")
     print(f"[{timestamp}] {message}", flush=True)
 
+# === Helper functions
+def trim_history(messages, max_blocks=20):
+    """
+    Cap the message history at max_blocks entries (e.g. 20: 10 user + 10 assistant).
+    The oldest messages are dropped.
+    """
+    return messages[-max_blocks:]
+
+def enforce_token_budget(tokenizer, system_prompt, history_messages, user_input, total_ctx=4096, max_new_tokens=300):
+    """
+    Trim the history as needed so the total token limit is not exceeded.
+    Messages are dropped whole, never truncated mid-message.
+    """
+    system_tokens = len(tokenizer(system_prompt)['input_ids'])
+    user_tokens = len(tokenizer(user_input)['input_ids'])
+    history_tokens = sum(len(tokenizer(m['content'])['input_ids']) for m in history_messages)
+
+    log(f"ℹ️ Token counts -> System: {system_tokens}, History: {history_tokens}, User: {user_tokens}")
+
+    available_budget = total_ctx - max_new_tokens
+    total_input_tokens = system_tokens + history_tokens + user_tokens
+
+    if total_input_tokens <= available_budget:
+        log(f"✅ Within token budget (total {total_input_tokens}/{available_budget})")
+        return history_messages
+
+    trimmed_history = history_messages.copy()
+    while trimmed_history:
+        current_history_tokens = sum(len(tokenizer(m['content'])['input_ids']) for m in trimmed_history)
+        total_input_tokens = system_tokens + current_history_tokens + user_tokens
+        if total_input_tokens <= available_budget:
+            break
+
+        removed = trimmed_history.pop(0)
+        removed_tokens = len(tokenizer(removed['content'])['input_ids'])
+        log(f"⚠️ Token budget exceeded, dropped oldest {removed['role']} message ({removed_tokens} tokens).")
+
+    final_tokens = system_tokens + sum(len(tokenizer(m['content'])['input_ids']) for m in trimmed_history) + user_tokens
+    log(f"✅ Trimmed token total: {final_tokens}/{available_budget}")
+
+    return trimmed_history
+
 # === Global model variables
 tokenizer = None
 model = None
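The two helpers above are pure functions over the tokenizer, so they can be sanity-checked without loading the model. A minimal sketch, assuming trim_history, enforce_token_budget, and log from app.py are in scope; FakeTokenizer and the sample messages are illustrative stand-ins, not part of the commit:

class FakeTokenizer:
    """Illustrative stand-in: one 'token' per whitespace-separated word."""
    def __call__(self, text):
        return {"input_ids": text.split()}

history = [
    {"role": "user", "content": "word " * 500},      # ~500 fake tokens
    {"role": "assistant", "content": "short reply"},  # 2 fake tokens
]

trimmed = trim_history(history, max_blocks=20)  # no-op here: only 2 messages
trimmed = enforce_token_budget(
    FakeTokenizer(), "You are helpful.", trimmed, "Hi!",
    total_ctx=600, max_new_tokens=300,
)
# Budget is 600 - 300 = 300 input tokens; the 500-word message cannot fit,
# so it is dropped whole and only the assistant reply survives.
assert [m["role"] for m in trimmed] == ["assistant"]

One caveat the budget math carries: only the raw message contents are counted, so chat-template overhead (role markers, special tokens) is not budgeted; the generation-side cap min(300, total_ctx - input_len) in the handler below is what keeps the final total inside the context window.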
@@ -34,7 +76,7 @@ async def lifespan(app: FastAPI):
     tokenizer.pad_token = tokenizer.eos_token
 
     quant_config = BitsAndBytesConfig(
-        load_in_8bit=True,  # ✅ 8-bit quantization (modern BitsAndBytesConfig)
+        load_in_8bit=True,
         llm_int8_threshold=6.0
     )
 
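This config is consumed by from_pretrained during startup; the model id and the rest of the load call sit outside this diff, so the sketch below fills them in with placeholders:

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

MODEL_ID = "org/model-name"  # placeholder; the real id is not shown in this diff

quant_config = BitsAndBytesConfig(
    load_in_8bit=True,       # weights quantized to int8 (LLM.int8())
    llm_int8_threshold=6.0,  # outlier activations above this stay in fp16
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=quant_config,
    device_map="auto",
)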
@@ -58,6 +100,7 @@ app = FastAPI(lifespan=lifespan)
 class UserInputRequest(BaseModel):
     user_input: str
     system_prompt: str
+    history: list  # [{"role": "user"/"assistant", "content": "..."}, ...]
 
 @app.post("/generate")
 def generate(req: UserInputRequest):
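With the new history field, each request now carries the running conversation. A client-side sketch; the URL, port, and sample contents are assumptions, not part of the commit:

import requests

payload = {
    "user_input": "And in Fahrenheit?",
    "system_prompt": "You are a helpful assistant.",
    "history": [
        {"role": "user", "content": "What is 20 degrees Celsius in Kelvin?"},
        {"role": "assistant", "content": "293.15 K."},
    ],
}
resp = requests.post("http://localhost:7860/generate", json=payload)
print(resp.json())

Because history is declared as a bare list, Pydantic will accept any JSON array; tightening it to list[dict] or a nested BaseModel of role/content pairs would reject malformed entries up front.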
@@ -65,17 +108,24 @@ def generate(req: UserInputRequest):
     overall_start = time.time()
     log(f"💬 User request received: {req.user_input}")
 
+    # === Trim history
+    trimmed_history = trim_history(req.history, max_blocks=20)
+    trimmed_history = enforce_token_budget(tokenizer, req.system_prompt, trimmed_history, req.user_input, total_ctx=4096, max_new_tokens=300)
+
     # === Apply chat template
     t0 = time.time()
-    messages = [
-        {"role": "system", "content": req.system_prompt},
-        {"role": "user", "content": req.user_input}
-    ]
-    chat_template_str = tokenizer.apply_chat_template(
+    messages = [{"role": "system", "content": req.system_prompt}] + trimmed_history + [{"role": "user", "content": req.user_input}]
+    chat_template_raw = tokenizer.apply_chat_template(
         messages,
         add_generation_prompt=True,
         return_tensors=None
     )
+    if chat_template_raw is None:
+        chat_template_str = ""
+    elif isinstance(chat_template_raw, str):
+        chat_template_str = chat_template_raw
+    else:
+        chat_template_str = str(chat_template_raw)
     t1 = time.time()
     log(f"⏱️ apply_chat_template time: {t1 - t0:.2f} seconds")
 
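The isinstance ladder guards against apply_chat_template not returning a string: with the default tokenize=True, transformers returns token ids here (return_tensors=None only controls the tensor framing), and str() on that list would yield "[1, 2, ...]" rather than a prompt. If a rendered prompt string is the intent, the usual spelling is tokenize=False; a hedged sketch of that variant:

# Alternative to the branching above: ask for a string directly.
chat_template_str = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=False,  # return the rendered prompt as str, not token ids
)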
@@ -93,7 +143,7 @@ def generate(req: UserInputRequest):
 
     input_len = input_ids.shape[-1]
     total_ctx = model.config.max_position_embeddings if hasattr(model.config, 'max_position_embeddings') else 4096
-    max_new_tokens = min(512, max(1, total_ctx - input_len))
+    max_new_tokens = min(300, max(1, total_ctx - input_len))
 
     log(f"ℹ️ Input length: {input_len}, max_new_tokens set to: {max_new_tokens}")
 
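The cap drops from 512 to 300, matching the reservation enforce_token_budget makes, so the two sides of the budget now agree. A quick worked check with the diff's numbers:

total_ctx = 4096
reserved = 300                           # enforce_token_budget's max_new_tokens
available_budget = total_ctx - reserved  # 3796 input tokens allowed
input_len = 3796                         # worst case after trimming
max_new_tokens = min(300, max(1, total_ctx - input_len))  # min(300, 300) = 300
assert input_len + max_new_tokens <= total_ctx            # 4096 <= 4096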
 
18
  timestamp = datetime.now().strftime("%H:%M:%S")
19
  print(f"[{timestamp}] {message}", flush=True)
20
 
21
+ # === Helper fonksiyonlar
22
+ def trim_history(messages, max_blocks=20):
23
+ """
24
+ Mesaj geçmişini en fazla max_blocks (örn. 20: 10 user + 10 assistant) ile sınırla.
25
+ En eski mesajlar atılır.
26
+ """
27
+ return messages[-max_blocks:]
28
+
29
+ def enforce_token_budget(tokenizer, system_prompt, history_messages, user_input, total_ctx=4096, max_new_tokens=300):
30
+ """
31
+ Toplam token limitini aşmamak için history’yi gerekirse budar.
32
+ Her mesaj komple kesilir, kısmen alınmaz.
33
+ """
34
+ system_tokens = len(tokenizer(system_prompt)['input_ids'])
35
+ user_tokens = len(tokenizer(user_input)['input_ids'])
36
+ history_tokens = sum(len(tokenizer(m['content'])['input_ids']) for m in history_messages)
37
+
38
+ log(f"ℹ️ Token hesaplama -> System: {system_tokens}, History: {history_tokens}, User: {user_tokens}")
39
+
40
+ available_budget = total_ctx - max_new_tokens
41
+ total_input_tokens = system_tokens + history_tokens + user_tokens
42
+
43
+ if total_input_tokens <= available_budget:
44
+ log(f"✅ Token bütçesi uygun (toplam {total_input_tokens}/{available_budget})")
45
+ return history_messages
46
+
47
+ trimmed_history = history_messages.copy()
48
+ while trimmed_history:
49
+ current_history_tokens = sum(len(tokenizer(m['content'])['input_ids']) for m in trimmed_history)
50
+ total_input_tokens = system_tokens + current_history_tokens + user_tokens
51
+ if total_input_tokens <= available_budget:
52
+ break
53
+
54
+ removed = trimmed_history.pop(0)
55
+ removed_tokens = len(tokenizer(removed['content'])['input_ids'])
56
+ log(f"⚠️ Token bütçesi aşıldı, en eski {removed['role']} mesajı ({removed_tokens} token) atıldı.")
57
+
58
+ final_tokens = system_tokens + sum(len(tokenizer(m['content'])['input_ids']) for m in trimmed_history) + user_tokens
59
+ log(f"✅ Budanmış token toplamı: {final_tokens}/{available_budget}")
60
+
61
+ return trimmed_history
62
+
63
  # === Global model değişkenleri
64
  tokenizer = None
65
  model = None
 
76
  tokenizer.pad_token = tokenizer.eos_token
77
 
78
  quant_config = BitsAndBytesConfig(
79
+ load_in_8bit=True,
80
  llm_int8_threshold=6.0
81
  )
82
 
 
100
  class UserInputRequest(BaseModel):
101
  user_input: str
102
  system_prompt: str
103
+ history: list # [{"role": "user"/"assistant", "content": "..."}, ...]
104
 
105
  @app.post("/generate")
106
  def generate(req: UserInputRequest):
 
108
  overall_start = time.time()
109
  log(f"💬 Kullanıcı isteği alındı: {req.user_input}")
110
 
111
+ # === History budama
112
+ trimmed_history = trim_history(req.history, max_blocks=20)
113
+ trimmed_history = enforce_token_budget(tokenizer, req.system_prompt, trimmed_history, req.user_input, total_ctx=4096, max_new_tokens=300)
114
+
115
  # === Apply chat template
116
  t0 = time.time()
117
+ messages = [{"role": "system", "content": req.system_prompt}] + trimmed_history + [{"role": "user", "content": req.user_input}]
118
+ chat_template_raw = tokenizer.apply_chat_template(
 
 
 
119
  messages,
120
  add_generation_prompt=True,
121
  return_tensors=None
122
  )
123
+ if chat_template_raw is None:
124
+ chat_template_str = ""
125
+ elif isinstance(chat_template_raw, str):
126
+ chat_template_str = chat_template_raw
127
+ else:
128
+ chat_template_str = str(chat_template_raw)
129
  t1 = time.time()
130
  log(f"⏱️ apply_chat_template süresi: {t1 - t0:.2f} saniye")
131
 
 
143
 
144
  input_len = input_ids.shape[-1]
145
  total_ctx = model.config.max_position_embeddings if hasattr(model.config, 'max_position_embeddings') else 4096
146
+ max_new_tokens = min(300, max(1, total_ctx - input_len))
147
 
148
  log(f"ℹ️ Input uzunluğu: {input_len}, max_new_tokens ayarlandı: {max_new_tokens}")
149