# test-oncu / app.py
import os
import time
import threading
import traceback
from datetime import datetime
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from contextlib import asynccontextmanager
# === Environment variables (point the Hugging Face cache at a writable path)
os.environ.setdefault("HF_HOME", "/app/.cache")
os.environ.setdefault("HF_HUB_CACHE", "/app/.cache")
# === Timestamped logging helper
def log(message):
    timestamp = datetime.now().strftime("%H:%M:%S")
    print(f"[{timestamp}] {message}", flush=True)
# === Global model handles, populated during startup
tokenizer = None
model = None
# === Lifespan definition
@asynccontextmanager
async def lifespan(app: FastAPI):
    global tokenizer, model
    try:
        model_name = "ytu-ce-cosmos/Turkish-Llama-8b-DPO-v0.1"
        log(f"⬇️ Starting model load: {model_name}")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Llama tokenizers ship without a pad token; reuse EOS for padding
        tokenizer.pad_token = tokenizer.eos_token
        quant_config = BitsAndBytesConfig(
            load_in_8bit=True,  # 8-bit quantization (modern BitsAndBytesConfig API)
            llm_int8_threshold=6.0
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            quantization_config=quant_config
        )
        log("✅ Model and tokenizer ready (8-bit quantized, BitsAndBytesConfig).")
        yield
    except Exception as e:
        log(f"❌ Model load error: {e}")
        traceback.print_exc()
        raise
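
# Note: load_in_8bit needs the bitsandbytes package and, in typical setups, a CUDA
# GPU. As a rough estimate, an 8B-parameter model quantized to 8 bits takes on the
# order of 8-9 GB of VRAM for the weights, plus activation and KV-cache overhead.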
# === FastAPI application
app = FastAPI(lifespan=lifespan)

class UserInputRequest(BaseModel):
    user_input: str
    system_prompt: str
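
# Example request body (illustrative values only):
# {"user_input": "Merhaba!", "system_prompt": "Sen yardımcı bir asistansın."}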
@app.post("/generate")
def generate(req: UserInputRequest):
    try:
        overall_start = time.time()
        log(f"💬 User request received: {req.user_input}")
        # === Apply the chat template
        t0 = time.time()
        messages = [
            {"role": "system", "content": req.system_prompt},
            {"role": "user", "content": req.user_input}
        ]
        # tokenize=False makes apply_chat_template return the formatted prompt as a
        # string; the default (tokenize=True) would return token ids, which the
        # tokenizer call below could not consume.
        chat_template_str = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        t1 = time.time()
        log(f"⏱️ apply_chat_template time: {t1 - t0:.2f} s")
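        # For a Llama-3-based model such as this one, the templated string typically
        # looks like the sketch below (exact markers depend on the tokenizer config):
        # <|begin_of_text|><|start_header_id|>system<|end_header_id|>
        # {system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>
        # {user_input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>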
        # === Build input_ids + attention_mask with the tokenizer
        t2 = time.time()
        # add_special_tokens=False avoids prepending a second BOS token on top of
        # the one the chat template already emits.
        tokenized_inputs = tokenizer(
            chat_template_str,
            return_tensors="pt",
            padding=True,
            add_special_tokens=False
        ).to(model.device)
        input_ids = tokenized_inputs['input_ids']
        attention_mask = tokenized_inputs['attention_mask']
        t3 = time.time()
        log(f"⏱️ tokenize time: {t3 - t2:.2f} s")
        # Cap generation so prompt + completion stays within the model's context window
        input_len = input_ids.shape[-1]
        total_ctx = getattr(model.config, 'max_position_embeddings', 4096)
        max_new_tokens = min(512, max(1, total_ctx - input_len))
        log(f"ℹ️ Input length: {input_len}, max_new_tokens set to: {max_new_tokens}")
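        # Worked example (assuming a Llama-3-class context of 8192 tokens): with a
        # 300-token input, 8192 - 300 = 7892, so max_new_tokens is capped at 512;
        # with an 8000-token input, 8192 - 8000 = 192 becomes the limit instead.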
        # === Generate
        t4 = time.time()
        # Stop on EOS, and on the Llama-3 end-of-turn token if the vocabulary has it
        terminators = [
            tokenizer.eos_token_id,
            tokenizer.convert_tokens_to_ids("<|eot_id|>") if "<|eot_id|>" in tokenizer.get_vocab() else tokenizer.eos_token_id
        ]
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            eos_token_id=terminators,
            pad_token_id=tokenizer.eos_token_id  # silences the missing-pad-token warning
        )
        t5 = time.time()
        log(f"⏱️ generate time: {t5 - t4:.2f} s")
        # === Decode: generate returns prompt + completion, so slice off the prompt
        t6 = time.time()
        response = outputs[0][input_len:]
        answer = tokenizer.decode(response, skip_special_tokens=True)
        t7 = time.time()
        log(f"⏱️ decode time: {t7 - t6:.2f} s")
        overall_elapsed = time.time() - overall_start
        log(f"✅ Total response time: {overall_elapsed:.2f} s")
        return {"response": answer}
    except Exception as e:
        log(f"❌ /generate error: {e}")
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=str(e))
@app.get("/")
def health():
    return {"status": "ok"}
# === Serve the API from a daemon thread; keep the main thread alive for the Space
def run_health_server():
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)

threading.Thread(target=run_health_server, daemon=True).start()

log("⏸️ Application idling...")
while True:
    time.sleep(60)
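
# Example call against the running server (sketch; assumes it is reachable locally
# on port 7860, as configured above):
#
#   curl -X POST http://localhost:7860/generate \
#     -H "Content-Type: application/json" \
#     -d '{"user_input": "Merhaba!", "system_prompt": "Sen yardımcı bir asistansın."}'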