import os
import time
import threading
import traceback
from contextlib import asynccontextmanager
from datetime import datetime

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
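
# Keep the Hugging Face cache under /app so model weights are stored in a
# predictable location inside the container.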
os.environ.setdefault("HF_HOME", "/app/.cache")
os.environ.setdefault("HF_HUB_CACHE", "/app/.cache")


def log(message):
    """Print a timestamped message, flushed immediately so it shows up in the logs."""
    timestamp = datetime.now().strftime("%H:%M:%S")
    print(f"[{timestamp}] {message}", flush=True)


# Populated once at startup by the lifespan handler below.
tokenizer = None
model = None
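

# FastAPI lifespan handler: load the tokenizer and the 8-bit quantized model
# once, before the server starts accepting requests.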
@asynccontextmanager
async def lifespan(app: FastAPI):
    global tokenizer, model
    try:
        model_name = "ytu-ce-cosmos/Turkish-Llama-8b-DPO-v0.1"
        log(f"⬇️ Starting model load: {model_name}")

        tokenizer = AutoTokenizer.from_pretrained(model_name)
        # The tokenizer has no pad token by default; reuse the EOS token for padding.
        tokenizer.pad_token = tokenizer.eos_token

        # Load the weights in 8-bit to reduce GPU memory usage.
        quant_config = BitsAndBytesConfig(
            load_in_8bit=True,
            llm_int8_threshold=6.0
        )

        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            quantization_config=quant_config
        )

        log("✅ Model and tokenizer ready (8-bit quantized via BitsAndBytesConfig).")
        yield

    except Exception as e:
        log(f"❌ Model loading error: {e}")
        traceback.print_exc()
        raise


app = FastAPI(lifespan=lifespan)
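

# Request body for /generate: the user's message and the system prompt to apply.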
class UserInputRequest(BaseModel):
    user_input: str
    system_prompt: str
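

# Chat endpoint: builds a chat-templated prompt from the request, runs
# generation, and returns only the newly generated text.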
@app.post("/generate")
def generate(req: UserInputRequest):
    try:
        overall_start = time.time()
        log(f"💬 User request received: {req.user_input}")
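
        # Build the chat-formatted prompt and tokenize it with the model's chat template.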
        t0 = time.time()
        messages = [
            {"role": "system", "content": req.system_prompt},
            {"role": "user", "content": req.user_input}
        ]
        chat_input = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(model.device)
        t1 = time.time()
        log(f"⏱️ apply_chat_template time: {t1 - t0:.2f} s")
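
        # Cap generation so the prompt plus the completion fits in the model's context window.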
        input_len = chat_input.shape[-1]
        total_ctx = getattr(model.config, "max_position_embeddings", 4096)
        max_new_tokens = min(512, max(1, total_ctx - input_len))

        log(f"ℹ️ Input length: {input_len}, max_new_tokens set to: {max_new_tokens}")
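
        # Stop generation on EOS, and also on <|eot_id|> if the tokenizer defines it
        # (the Llama 3 style end-of-turn marker).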
        t2 = time.time()
        terminators = [
            tokenizer.eos_token_id,
            tokenizer.convert_tokens_to_ids("<|eot_id|>") if "<|eot_id|>" in tokenizer.get_vocab() else tokenizer.eos_token_id
        ]
        outputs = model.generate(
            input_ids=chat_input,
            max_new_tokens=max_new_tokens,
            eos_token_id=terminators
        )
        t3 = time.time()
        log(f"⏱️ generate time: {t3 - t2:.2f} s")
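
        # Slice off the prompt tokens and decode only the newly generated part.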
        t4 = time.time()
        response = outputs[0][input_len:]
        answer = tokenizer.decode(response, skip_special_tokens=True)
        t5 = time.time()
        log(f"⏱️ decode time: {t5 - t4:.2f} s")

        overall_end = time.time()
        overall_elapsed = overall_end - overall_start
        log(f"✅ Total response time: {overall_elapsed:.2f} s")

        return {"response": answer}

    except Exception as e:
        log(f"❌ /generate error: {e}")
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=str(e))
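

# Example client call (a sketch; assumes the service is reachable on
# localhost:7860 and that the `requests` package is installed):
#
#   import requests
#   resp = requests.post(
#       "http://localhost:7860/generate",
#       json={"user_input": "Merhaba!", "system_prompt": "You are a helpful assistant."},
#   )
#   print(resp.json()["response"])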


@app.get("/")
def health():
    return {"status": "ok"}
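

# Serve the FastAPI app (both / and /generate) with uvicorn on port 7860.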
def run_health_server():
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)
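

# Start the server in a daemon thread and keep the main thread alive;
# if the main thread exited, the daemon thread (and the server) would stop with it.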
threading.Thread(target=run_health_server, daemon=True).start()

log("⏸️ Application in standby mode...")
while True:
    time.sleep(60)