import os
import sys
import time
import threading
import traceback
from contextlib import asynccontextmanager
from datetime import datetime

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM

# === Environment variables (must be set before any HF Hub download starts)
os.environ.setdefault("HF_HOME", "/app/.cache")
os.environ.setdefault("HF_HUB_CACHE", "/app/.cache")


def log(message):
    """Print *message* prefixed with an HH:MM:SS timestamp, flushing immediately."""
    timestamp = datetime.now().strftime("%H:%M:%S")
    print(f"[{timestamp}] {message}", flush=True)


# === Global model handles, populated exactly once by the lifespan hook.
tokenizer = None
model = None


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load tokenizer and model at startup; the app serves requests while yielded.

    Re-raises any loading error after logging it, so startup fails loudly
    instead of serving with `model is None`.
    """
    global tokenizer, model
    try:
        model_name = "ytu-ce-cosmos/Turkish-Llama-8b-DPO-v0.1"
        log(f"⬇️ Model yükleme başlatılıyor: {model_name}")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Fix: only fall back to EOS when the checkpoint ships no pad token,
        # so a genuine pad token is never clobbered.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype="auto",
            device_map="auto",
        )
        log("✅ Model ve tokenizer başarıyla hazır.")
        yield  # application handles requests here
    except Exception as e:
        log(f"❌ Model yükleme hatası: {e}")
        traceback.print_exc()
        raise


# === FastAPI application
app = FastAPI(lifespan=lifespan)


class UserInputRequest(BaseModel):
    # Free-form user prompt plus the system prompt to prepend to the chat.
    user_input: str
    system_prompt: str


@app.post("/generate")
def generate(req: UserInputRequest):
    """Run one chat completion and return ``{"response": <decoded text>}``.

    Raises:
        HTTPException: 500 with the original error text on any failure.
    """
    try:
        start_time = time.time()
        log(f"💬 Kullanıcı isteği alındı: {req.user_input}")
        messages = [
            {"role": "system", "content": req.system_prompt},
            {"role": "user", "content": req.user_input},
        ]
        chat_input = tokenizer.apply_chat_template(
            messages, add_generation_prompt=True, return_tensors="pt"
        ).to(model.device)
        input_len = chat_input.shape[-1]
        # Token budget: whatever context remains after the prompt (min. 1).
        total_ctx = getattr(model.config, "max_position_embeddings", 4096)
        max_new_tokens = max(1, total_ctx - input_len)
        log(f"ℹ️ Input uzunluğu: {input_len}, max_new_tokens ayarlandı: {max_new_tokens}")
        # Llama-3-style checkpoints end a turn with <|eot_id|>; fall back to
        # plain EOS when that token is absent from the vocabulary.
        terminators = [
            tokenizer.eos_token_id,
            tokenizer.convert_tokens_to_ids("<|eot_id|>")
            if "<|eot_id|>" in tokenizer.get_vocab()
            else tokenizer.eos_token_id,
        ]
        outputs = model.generate(
            input_ids=chat_input,
            max_new_tokens=max_new_tokens,
            eos_token_id=terminators,
        )
        # Strip the prompt tokens; decode only the newly generated tail.
        response = outputs[0][input_len:]
        answer = tokenizer.decode(response, skip_special_tokens=True)
        elapsed = time.time() - start_time
        log(f"✅ Yanıt süresi: {elapsed:.2f} saniye")
        return {"response": answer}
    except Exception as e:
        log(f"❌ /generate hatası: {e}")
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=str(e))


@app.get("/")
def health():
    """Liveness probe."""
    return {"status": "ok"}


def run_health_server():
    """Serve the FastAPI app on 0.0.0.0:7860 (blocking call)."""
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)


if __name__ == "__main__":
    # Fix: previously the server thread and the keep-alive loop ran at import
    # time, which made the module un-importable (any importer would block
    # forever in the sleep loop). Behavior when run as a script is unchanged:
    # uvicorn runs in a daemon thread while the main thread idles.
    threading.Thread(target=run_health_server, daemon=True).start()
    log("⏸️ Uygulama bekleme modunda...")
    while True:
        time.sleep(60)