import os
import time
import traceback
from contextlib import asynccontextmanager
from datetime import datetime

import torch
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# === Environment variables
os.environ.setdefault("HF_HOME", "/app/.cache")
os.environ.setdefault("HF_HUB_CACHE", "/app/.cache")

# === Timestamped log helper
def log(message):
    timestamp = datetime.now().strftime("%H:%M:%S")
    print(f"[{timestamp}] {message}", flush=True)

# === Global model handles
tokenizer = None
model = None

# === Lifespan definition
@asynccontextmanager
async def lifespan(app: FastAPI):
    global tokenizer, model
    try:
        model_name = "ytu-ce-cosmos/Turkish-Llama-8b-DPO-v0.1"
        log(f"⬇️ Model yükleme başlatılıyor: {model_name}")

        tokenizer = AutoTokenizer.from_pretrained(model_name)
        tokenizer.pad_token = tokenizer.eos_token  # Llama tokenizers ship without a pad token

        quant_config = BitsAndBytesConfig(
            load_in_8bit=True,  # ✅ 8-bit quantization (the current recommended API)
            llm_int8_threshold=6.0
        )
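
        # A 4-bit variant is a possible alternative for tighter memory budgets.
        # Sketch only; assumes a bitsandbytes build with 4-bit support:
        # quant_config = BitsAndBytesConfig(
        #     load_in_4bit=True,
        #     bnb_4bit_compute_dtype=torch.bfloat16,
        #     bnb_4bit_quant_type="nf4",
        # )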

        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            quantization_config=quant_config
        )
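
        # Optional sanity check (illustrative): transformers models expose
        # get_memory_footprint(), which reports the loaded weight size.
        # log(f"Approx. weight memory: {model.get_memory_footprint() / 1e9:.2f} GB")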

        log("✅ Model ve tokenizer başarıyla hazır (8-bit quantized, BitsAndBytesConfig).")
        yield  # Uygulama burada çalışır

    except Exception as e:
        log(f"❌ Model yükleme hatası: {e}")
        traceback.print_exc()
        raise

# === Create the FastAPI app
app = FastAPI(lifespan=lifespan)

class UserInputRequest(BaseModel):
    user_input: str
    system_prompt: str

@app.post("/generate")
def generate(req: UserInputRequest):
    try:
        overall_start = time.time()
        log(f"💬 Kullanıcı isteği alındı: {req.user_input}")

        # === Apply chat template
        t0 = time.time()
        messages = [
            {"role": "system", "content": req.system_prompt},
            {"role": "user", "content": req.user_input}
        ]
        chat_input = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(model.device)
        t1 = time.time()
        log(f"⏱️ apply_chat_template süresi: {t1 - t0:.2f} saniye")

        input_len = chat_input.shape[-1]
        # Cap generation so prompt + completion stays within the model's context window.
        total_ctx = getattr(model.config, "max_position_embeddings", 4096)
        max_new_tokens = min(512, max(1, total_ctx - input_len))

        log(f"ℹ️ Input length: {input_len}, max_new_tokens set to {max_new_tokens}")

        # === Generate
        t2 = time.time()
        # Stop on EOS; Llama-3-style templates also use <|eot_id|> as a turn terminator.
        terminators = [tokenizer.eos_token_id]
        if "<|eot_id|>" in tokenizer.get_vocab():
            terminators.append(tokenizer.convert_tokens_to_ids("<|eot_id|>"))
        outputs = model.generate(
            input_ids=chat_input,
            attention_mask=torch.ones_like(chat_input),  # single unpadded sequence
            max_new_tokens=max_new_tokens,
            eos_token_id=terminators,
            pad_token_id=tokenizer.eos_token_id,  # avoid the missing-pad-token warning
        )
        t3 = time.time()
        log(f"⏱️ generate süresi: {t3 - t2:.2f} saniye")

        # === Decode
        t4 = time.time()
        response = outputs[0][input_len:]
        answer = tokenizer.decode(response, skip_special_tokens=True)
        t5 = time.time()
        log(f"⏱️ decode süresi: {t5 - t4:.2f} saniye")

        overall_end = time.time()
        overall_elapsed = overall_end - overall_start
        log(f"✅ Toplam yanıt süresi: {overall_elapsed:.2f} saniye")

        return {"response": answer}

    except Exception as e:
        log(f"❌ /generate hatası: {e}")
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=str(e))

@app.get("/")
def health():
    return {"status": "ok"}

if __name__ == "__main__":
    import uvicorn

    log("🚀 Starting server on 0.0.0.0:7860...")
    # uvicorn.run blocks until shutdown, so no daemon thread or keep-alive loop is needed.
    uvicorn.run(app, host="0.0.0.0", port=7860)
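
# Example request once the server is up (illustrative values; adjust host/port as deployed):
#   curl -X POST http://localhost:7860/generate \
#        -H "Content-Type: application/json" \
#        -d '{"user_input": "Merhaba!", "system_prompt": "Sen yardımcı bir asistansın."}'
#
# Expected shape of the response:
#   {"response": "..."}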