import os, time, torch, zipfile, threading, uvicorn
from fastapi import FastAPI
from fastapi.responses import HTMLResponse, JSONResponse
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
from huggingface_hub import hf_hub_download
from datetime import datetime
import random
# === Constants ===
HF_TOKEN = os.environ.get("HF_TOKEN")
MODEL_BASE = "mistralai/Mistral-7B-Instruct-v0.2"
FINE_TUNE_ZIP = "trained_model_000_009.zip"
FINE_TUNE_REPO = "UcsTurkey/trained-zips"
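# Greedy decoding by default; set USE_SAMPLING = True to enable temperature/top-p/top-k sampling.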
USE_SAMPLING = False
CONFIDENCE_THRESHOLD = -1.5
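# If the best first-token logit falls below CONFIDENCE_THRESHOLD, the answer is replaced with one of these canned fallbacks.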
FALLBACK_ANSWERS = [
"Bu konuda maalesef bilgim yok.",
"Ne demek istediğinizi tam anlayamadım.",
"Bu soruya şu an yanıt veremiyorum."
]
# === Logging
def log(message):
    timestamp = datetime.now().strftime("%H:%M:%S")
    # flush so messages show up immediately in container logs
    print(f"[{timestamp}] {message}", flush=True)
# === FastAPI
app = FastAPI()
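# Globals: in-memory chat history plus model/tokenizer handles filled in later by setup_model()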
chat_history = []
model = None
tokenizer = None
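# Request body schema for the /chat endpoint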
class Message(BaseModel):
    user_input: str
@app.get("/")
def health():
return {"status": "ok"}
@app.get("/start", response_class=HTMLResponse)
def root():
return """
Mistral 7B Chat
"""
@app.post("/chat")
def chat(msg: Message):
    global model, tokenizer
    try:
        if model is None:
            return {"error": "Model yüklenmedi"}
        user_input = msg.user_input.strip()
        if not user_input:
            return {"error": "Boş giriş"}

        prompt = f"SORU: {user_input}\nCEVAP:"
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        # Only pass sampling parameters when sampling is enabled; otherwise decode greedily.
        gen_kwargs = dict(
            max_new_tokens=128,
            do_sample=USE_SAMPLING,
            return_dict_in_generate=True,
            output_scores=True,
            suppress_tokens=[tokenizer.pad_token_id],  # keep the pad token out of the generated output
        )
        if USE_SAMPLING:
            gen_kwargs.update(temperature=0.7, top_p=0.9, top_k=50)

        with torch.no_grad():
            output = model.generate(**inputs, **gen_kwargs)

        decoded = tokenizer.decode(output.sequences[0], skip_special_tokens=True)
        answer = decoded[len(prompt):].strip()

        # Crude confidence check on the logits of the first generated token.
        if output.scores and len(output.scores) > 0:
            first_token_score = output.scores[0][0]
            if torch.isnan(first_token_score).any() or torch.isinf(first_token_score).any():
                log("⚠️ Invalid logits (NaN/Inf) detected.")
                return {"answer": random.choice(FALLBACK_ANSWERS)}
            max_score = torch.max(first_token_score).item()
            log(f"🔍 First token score: {max_score:.4f}")
            if max_score < CONFIDENCE_THRESHOLD:
                answer = random.choice(FALLBACK_ANSWERS)

        chat_history.append({"user": user_input, "bot": answer})
        log(f"Question: {user_input} → Answer: {answer[:60]}...")
        return {"answer": answer, "chat_history": chat_history}
    except Exception as e:
        log(f"❌ /chat error: {e}")
        return {"error": str(e)}
def detect_env():
device = "cuda" if torch.cuda.is_available() else "cpu"
supports_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8
return device, supports_bf16
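# Download the fine-tuned adapter zip from the Hub, extract it, load the tokenizer and base model,
# then apply the LoRA adapter on top of the base weights.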
def setup_model():
    global model, tokenizer
    try:
        log("📦 Downloading zip...")
        zip_path = hf_hub_download(
            repo_id=FINE_TUNE_REPO,
            filename=FINE_TUNE_ZIP,
            repo_type="model",
            token=HF_TOKEN
        )
        extract_path = "/app/extracted"
        os.makedirs(extract_path, exist_ok=True)
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(extract_path)

        tokenizer = AutoTokenizer.from_pretrained(os.path.join(extract_path, "output"))
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        device, supports_bf16 = detect_env()
        dtype = torch.bfloat16 if supports_bf16 else torch.float32
        log(f"🧠 Environment: {device.upper()}, dtype: {dtype}")

        base = AutoModelForCausalLM.from_pretrained(MODEL_BASE, torch_dtype=dtype).to(device)
        peft = PeftModel.from_pretrained(base, os.path.join(extract_path, "output"))
        model = peft.model.to(device)
        model.eval()
        log("✅ Model loaded.")
    except Exception as e:
        log(f"❌ Model setup error: {e}")
def run_server():
log("🌐 Uvicorn başlatılıyor...")
uvicorn.run(app, host="0.0.0.0", port=7860)
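# Startup: the model loads in a background thread so the server comes up and answers the health check
# immediately; the main thread only sleeps to keep the process (and its daemon threads) alive.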
log("🚀 Başlatılıyor...")
threading.Thread(target=setup_model, daemon=True).start()
threading.Thread(target=run_server, daemon=True).start()
while True:
time.sleep(60)