import os, torch, threading, uvicorn, time, traceback, zipfile, random
from fastapi import FastAPI
from fastapi.responses import HTMLResponse, JSONResponse
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
from huggingface_hub import hf_hub_download
from datetime import datetime
# === Environment
HF_TOKEN = os.getenv("HF_TOKEN")
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"
os.environ["TORCH_HOME"] = "/app/.torch_cache"
os.makedirs("/app/.torch_cache", exist_ok=True)
# === Settings
MODEL_BASE = "TURKCELL/Turkcell-LLM-7b-v1"  # alternative: "mistralai/Mistral-7B-Instruct-v0.2"
USE_FINE_TUNE = False
FINE_TUNE_REPO = "UcsTurkey/trained-zips"
FINE_TUNE_ZIP = "trained_model_000_009.zip"
USE_SAMPLING = False
CONFIDENCE_THRESHOLD = -1.5
FALLBACK_ANSWERS = [
    "Bu konuda maalesef bilgim yok.",
    "Ne demek istediğinizi tam anlayamadım.",
    "Bu soruya şu an yanıt veremiyorum."
]
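# Note on the fallback mechanism: CONFIDENCE_THRESHOLD is compared against the highest
# first-token score returned by model.generate. With renormalize_logits=True these scores are
# log-probabilities, so -1.5 corresponds to roughly a 22% top-token probability; answers that
# score below the threshold are replaced in /chat with a random entry from FALLBACK_ANSWERS.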
# === Log
def log(message):
    timestamp = time.strftime("%H:%M:%S")
    print(f"[{timestamp}] {message}", flush=True)
# === FastAPI
app = FastAPI()
chat_history = []
model = None
tokenizer = None
class Message(BaseModel):
    user_input: str

@app.get("/")
def health():
    return {"status": "ok"}
@app.get("/start", response_class=HTMLResponse)
def root():
return """
<html>
<body>
<h2>Mistral 7B Instruct Chat</h2>
<textarea id="input" rows="4" cols="60" placeholder="Write your instruction..."></textarea><br>
<button onclick="send()">Gönder</button><br><br>
<label>Model Cevabı:</label><br>
<textarea id="output" rows="10" cols="80" readonly style="white-space: pre-wrap;"></textarea>
<script>
async function send() {
const input = document.getElementById("input").value;
const res = await fetch('/chat', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ user_input: input })
});
const data = await res.json();
document.getElementById('output').value = data.answer || data.error || 'Hata oluştu.';
}
</script>
</body>
</html>
"""
@app.post("/chat")
def chat(msg: Message):
global model, tokenizer
try:
if model is None or tokenizer is None:
return {"error": "Model veya tokenizer henüz yüklenmedi."}
user_input = msg.user_input.strip()
if not user_input:
return {"error": "Boş giriş"}
messages = [{"role": "user", "content": user_input}]
input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True)
if isinstance(input_ids, torch.Tensor):
input_ids = input_ids.to(model.device)
attention_mask = (input_ids != tokenizer.pad_token_id).long()
inputs = {"input_ids": input_ids, "attention_mask": attention_mask}
else:
inputs = {k: v.to(model.device) for k, v in input_ids.items()}
if "attention_mask" not in inputs:
inputs["attention_mask"] = (inputs["input_ids"] != tokenizer.pad_token_id).long()
generate_args = {
"max_new_tokens": 128,
"return_dict_in_generate": True,
"output_scores": True,
"do_sample": USE_SAMPLING,
"pad_token_id": tokenizer.pad_token_id,
"eos_token_id": tokenizer.eos_token_id,
"renormalize_logits": True
}
if USE_SAMPLING:
generate_args.update({
"temperature": 0.7,
"top_p": 0.9,
"top_k": 50
})
with torch.no_grad():
output = model.generate(**inputs, **generate_args)
decoded = tokenizer.decode(output.sequences[0], skip_special_tokens=True)
input_text = tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True)
answer = decoded.replace(input_text, "").strip()
if output.scores and len(output.scores) > 0:
first_token_score = output.scores[0][0]
if torch.isnan(first_token_score).any() or torch.isinf(first_token_score).any():
log("⚠️ Geçersiz logit (NaN/Inf) tespit edildi.")
return {"answer": random.choice(FALLBACK_ANSWERS)}
max_score = torch.max(first_token_score).item()
log(f"🔍 İlk token skoru: {max_score:.4f}")
if max_score < CONFIDENCE_THRESHOLD:
answer = random.choice(FALLBACK_ANSWERS)
chat_history.append({"user": user_input, "bot": answer})
log(f"Soru: {user_input} → Cevap: {answer[:60]}...")
return {"answer": answer, "chat_history": chat_history}
except Exception as e:
log(f"❌ /chat hatası: {e}")
traceback.print_exc()
return {"error": str(e)}
def detect_env():
    return "cuda" if torch.cuda.is_available() else "cpu"
def setup_model():
    global model, tokenizer
    try:
        device = detect_env()
        dtype = torch.float32  # switch to torch.bfloat16 if desired
        if USE_FINE_TUNE:
            log("📦 Downloading fine-tune zip...")
            zip_path = hf_hub_download(
                repo_id=FINE_TUNE_REPO,
                filename=FINE_TUNE_ZIP,
                repo_type="model",
                token=HF_TOKEN
            )
            extract_dir = "/app/extracted"
            os.makedirs(extract_dir, exist_ok=True)
            with zipfile.ZipFile(zip_path, "r") as zip_ref:
                zip_ref.extractall(extract_dir)
            tokenizer = AutoTokenizer.from_pretrained(os.path.join(extract_dir, "output"), use_fast=False)
            base_model = AutoModelForCausalLM.from_pretrained(MODEL_BASE, torch_dtype=dtype).to(device)
            model = PeftModel.from_pretrained(base_model, os.path.join(extract_dir, "output")).to(device)
        else:
            log("🧠 Downloading base model...")
            tokenizer = AutoTokenizer.from_pretrained(MODEL_BASE, use_fast=False)
            model = AutoModelForCausalLM.from_pretrained(MODEL_BASE, torch_dtype=dtype).to(device)
        tokenizer.pad_token = tokenizer.pad_token or tokenizer.eos_token
        model.config.pad_token_id = tokenizer.pad_token_id
        model.eval()
        log("✅ Model loaded successfully.")
    except Exception as e:
        log(f"❌ Error while loading model: {e}")
        traceback.print_exc()
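# Note: with USE_FINE_TUNE=True the downloaded zip is expected to contain an "output/" directory
# holding the fine-tuned tokenizer files plus PEFT (LoRA) adapter weights, which are applied on
# top of MODEL_BASE via PeftModel.from_pretrained above.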
def run_server():
    log("🌐 Starting uvicorn...")
    uvicorn.run(app, host="0.0.0.0", port=7860)
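# Startup: model loading and the HTTP server each run in a daemon thread so the endpoint becomes
# reachable while the model is still downloading; the main thread only sleeps to keep them alive.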
log("===== Application Startup =====")
threading.Thread(target=setup_model, daemon=True).start()
threading.Thread(target=run_server, daemon=True).start()
while True:
    time.sleep(60)