# NOTE: removed non-Python extraction residue (file-size note, git blame hashes, line-number gutter).
import os
import sys
import time
import threading
import traceback
from datetime import datetime
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from contextlib import asynccontextmanager
# === Environment variables
# Point the Hugging Face cache at a writable location; must run before any
# transformers download is triggered. setdefault keeps externally-set values.
os.environ.setdefault("HF_HOME", "/app/.cache")
os.environ.setdefault("HF_HUB_CACHE", "/app/.cache")
# === Timestamped log helper
def log(message):
    """Print *message* prefixed with the current HH:MM:SS time, flushed immediately."""
    stamp = datetime.now().strftime("%H:%M:%S")
    line = f"[{stamp}] {message}"
    print(line, flush=True)
# === Global model state
# Populated once by the lifespan handler at startup; read by /generate.
tokenizer = None
model = None
# === Lifespan definition
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load the tokenizer and the 8-bit quantized model once at startup.

    The application serves requests while suspended at `yield`; any loading
    failure is logged with a traceback and re-raised so startup aborts.
    """
    global tokenizer, model
    try:
        model_name = "ytu-ce-cosmos/Turkish-Llama-8b-DPO-v0.1"
        log(f"⬇️ Model yükleme başlatılıyor: {model_name}")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Reuse EOS as the pad token (Llama tokenizers ship without one).
        tokenizer.pad_token = tokenizer.eos_token
        quant_config = BitsAndBytesConfig(
            load_in_8bit=True,  # ✅ 8-bit quantization (modern method)
            llm_int8_threshold=6.0
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            quantization_config=quant_config
        )
        log("✅ Model ve tokenizer başarıyla hazır (8-bit quantized, BitsAndBytesConfig).")
        yield  # The application runs here
    except Exception as e:
        log(f"❌ Model yükleme hatası: {e}")
        traceback.print_exc()
        raise
# === Start FastAPI
app = FastAPI(lifespan=lifespan)


class UserInputRequest(BaseModel):
    """Request body for POST /generate."""
    # user_input: the user's chat message; system_prompt: system instruction
    # injected at the top of the chat template.
    user_input: str
    system_prompt: str
@app.post("/generate")
def generate(req: UserInputRequest):
    """Run one chat-style generation for the request.

    Builds a chat prompt from the request's system prompt and user input,
    generates up to 512 new tokens (capped so prompt + completion fits the
    model's context window), and returns {"response": <decoded answer>}.
    Each stage's wall-clock time is logged.

    Raises:
        HTTPException: 500 with the original error message on any failure.
    """
    try:
        overall_start = time.time()
        log(f"💬 Kullanıcı isteği alındı: {req.user_input}")
        # === Apply chat template
        t0 = time.time()
        messages = [
            {"role": "system", "content": req.system_prompt},
            {"role": "user", "content": req.user_input}
        ]
        chat_input = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(model.device)
        t1 = time.time()
        log(f"⏱️ apply_chat_template süresi: {t1 - t0:.2f} saniye")
        input_len = chat_input.shape[-1]
        # Fall back to 4096 when the config does not expose the context limit.
        total_ctx = getattr(model.config, "max_position_embeddings", 4096)
        # Never exceed the context window, but always allow at least 1 token.
        max_new_tokens = min(512, max(1, total_ctx - input_len))
        log(f"ℹ️ Input uzunluğu: {input_len}, max_new_tokens ayarlandı: {max_new_tokens}")
        # === Generate
        t2 = time.time()
        # Stop on EOS and, when the vocabulary defines it, the Llama-3 turn
        # terminator <|eot_id|>; otherwise EOS is listed twice (harmless).
        vocab = tokenizer.get_vocab()
        terminators = [
            tokenizer.eos_token_id,
            tokenizer.convert_tokens_to_ids("<|eot_id|>") if "<|eot_id|>" in vocab else tokenizer.eos_token_id
        ]
        outputs = model.generate(
            input_ids=chat_input,
            max_new_tokens=max_new_tokens,
            eos_token_id=terminators
        )
        t3 = time.time()
        log(f"⏱️ generate süresi: {t3 - t2:.2f} saniye")
        # === Decode (drop the prompt tokens, keep only the completion)
        t4 = time.time()
        response = outputs[0][input_len:]
        answer = tokenizer.decode(response, skip_special_tokens=True)
        t5 = time.time()
        log(f"⏱️ decode süresi: {t5 - t4:.2f} saniye")
        overall_end = time.time()
        overall_elapsed = overall_end - overall_start
        log(f"✅ Toplam yanıt süresi: {overall_elapsed:.2f} saniye")
        return {"response": answer}
    except Exception as e:
        log(f"❌ /generate hatası: {e}")
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=str(e))
@app.get("/")
def health():
    """Liveness probe: report that the service is up."""
    payload = {"status": "ok"}
    return payload
def run_health_server():
    """Serve the FastAPI app (health + generate endpoints) via uvicorn on 0.0.0.0:7860."""
    # Imported lazily so module import does not require uvicorn.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)
# Run the HTTP server on a daemon thread, then park the main thread in an
# idle sleep loop so the process (and with it the daemon thread) stays alive.
threading.Thread(target=run_health_server, daemon=True).start()
log("⏸️ Uygulama bekleme modunda...")
while True:
    time.sleep(60)