ciyidogan committed on
Commit f1e3b60 · verified · 1 Parent(s): cf581f9

Update app.py

Files changed (1)
  1. app.py +66 -156
app.py CHANGED
@@ -1,187 +1,97 @@
- import os
- import sys
  import time
- import threading
- import traceback
  from datetime import datetime
- from fastapi import FastAPI, HTTPException
- from pydantic import BaseModel
- from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
- from contextlib import asynccontextmanager
  import torch

- # === Environment variables
- os.environ.setdefault("HF_HOME", "/app/.cache")
- os.environ.setdefault("HF_HUB_CACHE", "/app/.cache")
-
- # === Timestamped log function
  def log(message):
      timestamp = datetime.now().strftime("%H:%M:%S")
-     print(f"[{timestamp}] {message}", flush=True)
-
- # === Helper functions
- def trim_history(messages, max_blocks=20):
-     return messages[-max_blocks:]
-
- def enforce_token_budget(tokenizer, system_prompt, history_messages, user_input, total_ctx=4096, max_new_tokens=128):
-     system_tokens = len(tokenizer(system_prompt)['input_ids'])
-     user_tokens = len(tokenizer(user_input)['input_ids'])
-     history_tokens = sum(len(tokenizer(m['content'])['input_ids']) for m in history_messages)
-
-     log(f"ℹ️ Token counts -> System: {system_tokens}, History: {history_tokens}, User: {user_tokens}")
-
-     available_budget = total_ctx - max_new_tokens
-     total_input_tokens = system_tokens + history_tokens + user_tokens
-
-     if total_input_tokens <= available_budget:
-         log(f"✅ Token budget OK (total {total_input_tokens}/{available_budget})")
-         return history_messages
-
-     trimmed_history = history_messages.copy()
-     while trimmed_history:
-         current_history_tokens = sum(len(tokenizer(m['content'])['input_ids']) for m in trimmed_history)
-         total_input_tokens = system_tokens + current_history_tokens + user_tokens
-         if total_input_tokens <= available_budget:
-             break
-
-         removed = trimmed_history.pop(0)
-         removed_tokens = len(tokenizer(removed['content'])['input_ids'])
-         log(f"⚠️ Token budget exceeded; dropped the oldest {removed['role']} message ({removed_tokens} tokens).")

-     final_tokens = system_tokens + sum(len(tokenizer(m['content'])['input_ids']) for m in trimmed_history) + user_tokens
-     log(f"✅ Trimmed token total: {final_tokens}/{available_budget}")

-     return trimmed_history
-
- # === Global model variables
- tokenizer = None
- model = None
-
- # === Lifespan definition
- @asynccontextmanager
- async def lifespan(app: FastAPI):
-     global tokenizer, model
-     try:
-         model_name = "ytu-ce-cosmos/Turkish-Llama-8b-DPO-v0.1"
-         log(f"⬇️ Starting model load: {model_name}")
-
-         tokenizer = AutoTokenizer.from_pretrained(model_name)
-         tokenizer.pad_token = tokenizer.eos_token

-         quant_config = BitsAndBytesConfig(
-             load_in_4bit=True,
-             bnb_4bit_compute_dtype=torch.float16  # ✅ float16 speed-up
-         )

-         model = AutoModelForCausalLM.from_pretrained(
-             model_name,
-             device_map="auto",
-             quantization_config=quant_config
-         )

-         log("✅ Model and tokenizer ready (4-bit quantized, float16 compute).")
-         yield

-     except Exception as e:
-         log(f"❌ Model loading error: {e}")
-         traceback.print_exc()
-         raise

- # === Start FastAPI
- app = FastAPI(lifespan=lifespan)

- class UserInputRequest(BaseModel):
-     user_input: str
-     system_prompt: str
-     history: list  # [{"role": "user"/"assistant", "content": "..."}, ...]

- @app.post("/generate")
- def generate(req: UserInputRequest):
      try:
-         overall_start = time.time()
-         log(f"📥 User request received: {req.user_input}")
-
-         trimmed_history = trim_history(req.history, max_blocks=20)
-         trimmed_history = enforce_token_budget(tokenizer, req.system_prompt, trimmed_history, req.user_input, total_ctx=4096, max_new_tokens=128)
-
-         # === Apply chat template
-         t0 = time.time()
-         messages = [{"role": "system", "content": req.system_prompt}] + trimmed_history + [{"role": "user", "content": req.user_input}]
-         chat_template_raw = tokenizer.apply_chat_template(
              messages,
              add_generation_prompt=True,
-             return_tensors=None
-         )
-         if chat_template_raw is None:
-             chat_template_str = ""
-         elif isinstance(chat_template_raw, str):
-             chat_template_str = chat_template_raw
-         else:
-             chat_template_str = str(chat_template_raw)
-         t1 = time.time()
-         log(f"⏱️ apply_chat_template time: {t1 - t0:.2f} s")
-
-         # === Prepare input_ids + attention_mask with the tokenizer
-         t2 = time.time()
-         tokenized_inputs = tokenizer(
-             chat_template_str,
-             return_tensors="pt",
-             padding=True
          ).to(model.device)
-         input_ids = tokenized_inputs['input_ids']
-         attention_mask = tokenized_inputs['attention_mask']
-         t3 = time.time()
-         log(f"⏱️ tokenize time: {t3 - t2:.2f} s")
-
-         input_len = input_ids.shape[-1]
-         total_ctx = model.config.max_position_embeddings if hasattr(model.config, 'max_position_embeddings') else 4096
-         max_new_tokens = min(128, max(1, total_ctx - input_len))

-         log(f"ℹ️ Input length: {input_len}, max_new_tokens set to: {max_new_tokens}")
-
-         # === Generate
-         t4 = time.time()
          terminators = [
              tokenizer.eos_token_id,
-             tokenizer.convert_tokens_to_ids("<|eot_id|>") if "<|eot_id|>" in tokenizer.get_vocab() else tokenizer.eos_token_id
          ]
          outputs = model.generate(
-             input_ids=input_ids,
-             attention_mask=attention_mask,
-             max_new_tokens=max_new_tokens,
-             eos_token_id=terminators
          )
-         t5 = time.time()
-         log(f"⏱️ generate time: {t5 - t4:.2f} s")

-         # === Decode
-         t6 = time.time()
-         response = outputs[0][input_len:]
-         answer = tokenizer.decode(response, skip_special_tokens=True)
-         t7 = time.time()
-         log(f"⏱️ decode time: {t7 - t6:.2f} s")
-
-         overall_end = time.time()
-         overall_elapsed = overall_end - overall_start
-         log(f"✅ Total response time: {overall_elapsed:.2f} s")
-
-         return {"response": answer}

      except Exception as e:
-         log(f"❌ /generate error: {e}")
-         traceback.print_exc()
-         raise HTTPException(status_code=500, detail=str(e))
-
- @app.get("/")
- def health():
-     return {"status": "ok"}
-
- def run_health_server():
-     import uvicorn
-     uvicorn.run(app, host="0.0.0.0", port=7860)
-
- threading.Thread(target=run_health_server, daemon=True).start()

- log("⏸️ App in standby mode...")
  while True:
      time.sleep(60)
  import time
+ import sys
  from datetime import datetime
+ from fastapi import FastAPI, Request
+ import uvicorn
+ from transformers import AutoTokenizer, AutoModelForCausalLM
  import torch
+ import threading

+ # 🕒 Timestamped log function
  def log(message):
      timestamp = datetime.now().strftime("%H:%M:%S")
+     print(f"[{timestamp}] {message}")
+     sys.stdout.flush()

+ # Health-check server
+ app = FastAPI()

+ @app.get("/")
+ def health():
+     return {"status": "ok"}

+ def run_health_server():
+     uvicorn.run(app, host="0.0.0.0", port=7860)

+ threading.Thread(target=run_health_server, daemon=True).start()

+ # ✅ Model loading
+ MODEL_ID = "ytu-ce-cosmos/Turkish-Llama-8b-DPO-v0.1"
+ log("⬇️ Starting model and tokenizer load...")
+ start_time = time.time()
+ try:
+     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+     model = AutoModelForCausalLM.from_pretrained(
+         MODEL_ID,
+         torch_dtype=torch.bfloat16,
+         device_map="auto",
+     )
+     log(f"✅ Model loaded. Took {time.time() - start_time:.2f} s")
+ except Exception as e:
+     log(f"❌ Model loading error: {e}")
+     sys.exit(1)

+ @app.post("/generate")
+ async def generate(request: Request):
+     req_data = await request.json()
+     user_input = req_data.get("user_input", "")
+     system_prompt = req_data.get("system_prompt", "")

+     if not user_input or not system_prompt:
+         return {"error": "user_input and system_prompt are required."}

+     messages = [
+         {"role": "system", "content": system_prompt},
+         {"role": "user", "content": user_input},
+     ]

      try:
+         log("🧩 Starting input preparation...")
+         prep_start = time.time()
+         input_ids = tokenizer.apply_chat_template(
              messages,
              add_generation_prompt=True,
+             return_tensors="pt"
          ).to(model.device)
+         log(f"✅ Input prepared. Took {time.time() - prep_start:.2f} s")

          terminators = [
              tokenizer.eos_token_id,
+             tokenizer.convert_tokens_to_ids("<|eot_id|>")
          ]
+
+         log("🧠 Starting generate call...")
+         gen_start = time.time()
          outputs = model.generate(
+             input_ids,
+             max_new_tokens=256,
+             eos_token_id=terminators,
+             do_sample=True,
+             temperature=0.6,
+             top_p=0.9,
          )
+         log(f"✅ Generate finished. Took {time.time() - gen_start:.2f} s")

+         response = outputs[0][input_ids.shape[-1]:]
+         decoded_output = tokenizer.decode(response, skip_special_tokens=True)
+         log("✅ Response decoded successfully.")
+         return {"response": decoded_output}

      except Exception as e:
+         log(f"❌ Generate error: {e}")
+         return {"error": str(e)}

+ # 🧘 Keep waiting so the app does not restart after training
+ log("⏸️ App ready, standing by...")
  while True:
      time.sleep(60)
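
For reference, a minimal client for the new /generate endpoint could look like the sketch below. It assumes the Space is reachable at http://localhost:7860 (the port run_health_server binds) and that the requests package is installed; the JSON field names user_input and system_prompt mirror the handler above, and the hypothetical prompt strings are only placeholders:

import requests

# Hypothetical smoke test against the /generate endpoint defined in app.py.
payload = {
    "system_prompt": "You are a helpful assistant that answers in Turkish.",
    "user_input": "Merhaba, kendini tanıtır mısın?",  # sample Turkish prompt
}
resp = requests.post("http://localhost:7860/generate", json=payload, timeout=300)
data = resp.json()
# The handler returns {"response": ...} on success or {"error": ...} on failure.
print(data.get("response") or data.get("error"))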