import os, torch, threading, uvicorn, time, traceback, zipfile, random
from fastapi import FastAPI
from fastapi.responses import HTMLResponse
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
from huggingface_hub import hf_hub_download

# === Environment
HF_TOKEN = os.getenv("HF_TOKEN")
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"
os.environ["TORCH_HOME"] = "/app/.torch_cache"
os.makedirs("/app/.torch_cache", exist_ok=True)

# === Settings
MODEL_BASE = "TURKCELL/Turkcell-LLM-7b-v1"  # alternative: "mistralai/Mistral-7B-Instruct-v0.2"
USE_FINE_TUNE = False
FINE_TUNE_REPO = "UcsTurkey/trained-zips"
FINE_TUNE_ZIP = "trained_model_000_009.zip"
USE_SAMPLING = False
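# With greedy decoding, the threshold below gates on the maximum renormalized
# log-probability of the first generated token: exp(-1.5) ≈ 0.22, so an answer
# whose first token gets under ~22% probability is swapped for a fallback reply.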
CONFIDENCE_THRESHOLD = -1.5
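# Canned Turkish fallback replies, kept verbatim for the model's Turkish users
# ("Unfortunately I have no knowledge of this topic.", "I couldn't quite
# understand what you mean.", "I cannot answer this question right now.").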
FALLBACK_ANSWERS = [
    "Bu konuda maalesef bilgim yok.",
    "Ne demek istediğinizi tam anlayamadım.",
    "Bu soruya şu an yanıt veremiyorum."
]

# === Log
def log(message):
    timestamp = time.strftime("%H:%M:%S")
    print(f"[{timestamp}] {message}", flush=True)

# === FastAPI
app = FastAPI()
chat_history = []
model = None
tokenizer = None

class Message(BaseModel):
    user_input: str

@app.get("/")
def health():
    return {"status": "ok"}

@app.get("/start", response_class=HTMLResponse)
def root():
    return """
    <html>
    <body>
        <h2>LLM Instruct Chat</h2>
        <textarea id="input" rows="4" cols="60" placeholder="Write your instruction..."></textarea><br>
        <button onclick="send()">Send</button><br><br>
        <label>Model response:</label><br>
        <textarea id="output" rows="10" cols="80" readonly style="white-space: pre-wrap;"></textarea>
        <script>
        async function send() {
            const input = document.getElementById("input").value;
            const res = await fetch('/chat', {
                method: 'POST',
                headers: { 'Content-Type': 'application/json' },
                body: JSON.stringify({ user_input: input })
            });
            const data = await res.json();
            document.getElementById('output').value = data.answer || data.error || 'An error occurred.';
        }
        </script>
    </body>
    </html>
    """

@app.post("/chat")
def chat(msg: Message):
    global model, tokenizer
    try:
        if model is None or tokenizer is None:
            return {"error": "Model veya tokenizer henüz yüklenmedi."}

        user_input = msg.user_input.strip()
        if not user_input:
            return {"error": "Boş giriş"}

        messages = [{"role": "user", "content": user_input}]
        input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True)
        if isinstance(input_ids, torch.Tensor):
            input_ids = input_ids.to(model.device)
            attention_mask = (input_ids != tokenizer.pad_token_id).long()
            inputs = {"input_ids": input_ids, "attention_mask": attention_mask}
        else:
            inputs = {k: v.to(model.device) for k, v in input_ids.items()}
            if "attention_mask" not in inputs:
                inputs["attention_mask"] = (inputs["input_ids"] != tokenizer.pad_token_id).long()

        # output_scores with return_dict_in_generate exposes per-step logits for the
        # confidence check below; renormalize_logits makes them log-probabilities.
        generate_args = {
            "max_new_tokens": 128,
            "return_dict_in_generate": True,
            "output_scores": True,
            "do_sample": USE_SAMPLING,
            "pad_token_id": tokenizer.pad_token_id,
            "eos_token_id": tokenizer.eos_token_id,
            "renormalize_logits": True
        }

        if USE_SAMPLING:
            generate_args.update({
                "temperature": 0.7,
                "top_p": 0.9,
                "top_k": 50
            })

        with torch.no_grad():
            output = model.generate(**inputs, **generate_args)

        # Decode only the newly generated tokens; replacing the prompt substring
        # in the full decode is fragile once special tokens are stripped.
        prompt_length = inputs["input_ids"].shape[1]
        answer = tokenizer.decode(output.sequences[0][prompt_length:], skip_special_tokens=True).strip()

        if output.scores:
            first_token_score = output.scores[0][0]
            if torch.isnan(first_token_score).any() or torch.isinf(first_token_score).any():
                log("⚠️ Geçersiz logit (NaN/Inf) tespit edildi.")
                return {"answer": random.choice(FALLBACK_ANSWERS)}
            max_score = torch.max(first_token_score).item()
            log(f"🔍 İlk token skoru: {max_score:.4f}")
            if max_score < CONFIDENCE_THRESHOLD:
                answer = random.choice(FALLBACK_ANSWERS)

        chat_history.append({"user": user_input, "bot": answer})
        log(f"Soru: {user_input} → Cevap: {answer[:60]}...")
        return {"answer": answer, "chat_history": chat_history}

    except Exception as e:
        log(f"❌ /chat hatası: {e}")
        traceback.print_exc()
        return {"error": str(e)}

def detect_env():
    return "cuda" if torch.cuda.is_available() else "cpu"

def setup_model():
    global model, tokenizer
    try:
        device = detect_env()
        dtype = torch.float32  # you can switch this to torch.bfloat16 if you prefer

        if USE_FINE_TUNE:
            log("📦 Fine-tune zip indiriliyor...")
            zip_path = hf_hub_download(
                repo_id=FINE_TUNE_REPO,
                filename=FINE_TUNE_ZIP,
                repo_type="model",
                token=HF_TOKEN
            )
            extract_dir = "/app/extracted"
            os.makedirs(extract_dir, exist_ok=True)
            with zipfile.ZipFile(zip_path, "r") as zip_ref:
                zip_ref.extractall(extract_dir)
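            # The zip is expected to unpack into an "output/" directory holding the
            # tokenizer files alongside the PEFT adapter weights loaded below.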

            tokenizer = AutoTokenizer.from_pretrained(os.path.join(extract_dir, "output"), use_fast=False)
            base_model = AutoModelForCausalLM.from_pretrained(MODEL_BASE, torch_dtype=dtype).to(device)
            model = PeftModel.from_pretrained(base_model, os.path.join(extract_dir, "output")).to(device)
        else:
            log("🧠 Ana model indiriliyor...")
            tokenizer = AutoTokenizer.from_pretrained(MODEL_BASE, use_fast=False)
            model = AutoModelForCausalLM.from_pretrained(MODEL_BASE, torch_dtype=dtype).to(device)

        # Make sure a pad token exists (fall back to eos) and keep the model config in sync.
        tokenizer.pad_token = tokenizer.pad_token or tokenizer.eos_token
        model.config.pad_token_id = tokenizer.pad_token_id
        model.eval()
        log("✅ Model loaded successfully.")
    except Exception as e:
        log(f"❌ Model yüklenirken hata: {e}")
        traceback.print_exc()

def run_server():
    log("🌐 Starting Uvicorn...")
    uvicorn.run(app, host="0.0.0.0", port=7860)

log("===== Application Startup =====")
# Load the model in a background thread so the endpoints come up immediately;
# run the server in the main thread, which keeps the process alive and lets
# Uvicorn handle shutdown signals (no busy-wait loop needed).
threading.Thread(target=setup_model, daemon=True).start()
run_server()