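"""FastAPI wrapper around a llama-cpp-python model.

At startup the service downloads a GGUF checkpoint from the Hugging Face Hub
and exposes:
  - GET  /health               liveness probe
  - GET  /model_info           static model metadata
  - GET  /ram_usage            current system RAM statistics (via psutil)
  - POST /v1/chat/completions  OpenAI-compatible chat endpoint (streaming and non-streaming)

A periodic background task re-runs a tiny completion every 13 minutes to keep
the model warm.
"""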
import json
import time
from fastapi import FastAPI, HTTPException
from fastapi.responses import StreamingResponse, JSONResponse
from llama_cpp import Llama
from huggingface_hub import login, hf_hub_download
import logging
import os
import asyncio
import psutil # Added for RAM tracking
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
app = FastAPI()
# Global lock for model access
model_lock = asyncio.Lock()
# Authenticate with Hugging Face
hf_token = os.getenv("HF_TOKEN")
if not hf_token:
logger.error("HF_TOKEN environment variable not set.")
raise ValueError("HF_TOKEN not set")
login(token=hf_token)
# Models Configuration
repo_id = "unsloth/Qwen3-0.6B-GGUF" # "bartowski/deepcogito_cogito-v1-preview-llama-3B-GGUF" # "bartowski/deepcogito_cogito-v1-preview-llama-8B-GGUF"
filename = "Qwen3-0.6B-IQ4_XS.gguf" # "deepcogito_cogito-v1-preview-llama-3B-Q4_K_M.gguf"
try:
    # Load the model with optimized parameters
    logger.info(f"Loading {filename} model")
    model_path = hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        local_dir="/app/cache" if os.getenv("HF_HOME") else None,
        token=hf_token,
    )
    llm = Llama(
        model_path=model_path,
        n_ctx=3000,        # context window (tokens)
        n_threads=2,       # CPU threads used for inference
        n_batch=16,        # prompt-processing batch size
        n_gpu_layers=0,    # CPU-only inference
        use_mlock=True,    # lock weights in RAM to avoid swapping
        f16_kv=True,
        verbose=True,
        # NOTE: batch_prefill and prefill_logits are not standard Llama()
        # keyword arguments in every llama-cpp-python release and may be ignored.
        batch_prefill=True,
        prefill_logits=False,
    )
    logger.info(f"{filename} model loaded")
except Exception as e:
    logger.error(f"Startup error: {str(e)}", exc_info=True)
    raise
# RAM Usage Tracking Function
def get_ram_usage():
    """Return current system RAM statistics in GB (via psutil)."""
    memory = psutil.virtual_memory()
    total_ram = memory.total / (1024 ** 3)      # convert bytes to GB
    used_ram = memory.used / (1024 ** 3)        # convert bytes to GB
    free_ram = memory.available / (1024 ** 3)   # "available" memory, reported as free
    percent_used = memory.percent
    return {
        "total_ram_gb": round(total_ram, 2),
        "used_ram_gb": round(used_ram, 2),
        "free_ram_gb": round(free_ram, 2),
        "percent_used": percent_used,
    }
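# Example payload returned by get_ram_usage() (values are illustrative):
# {"total_ram_gb": 16.0, "used_ram_gb": 4.25, "free_ram_gb": 11.5, "percent_used": 28.1}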
@app.get("/health")
async def health_check():
return {"status": "healthy"}
@app.get("/model_info")
async def model_info():
    return {
        "model_name": repo_id,
        "model_size": "0.6B",
        "quantization": "IQ4_XS",
    }
@app.get("/ram_usage")
async def ram_usage():
    """Endpoint to get current RAM usage."""
    try:
        ram_stats = get_ram_usage()
        return ram_stats
    except Exception as e:
        logger.error(f"Error retrieving RAM usage: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Error retrieving RAM usage: {str(e)}")
# @app.on_event("startup")
# async def warm_up_model():
# logger.info("Warming up the model...")
# dummy_query = "Hello"
# dummy_history = []
# async for _ in stream_response(dummy_query, dummy_history):
# pass
# logger.info("Model warm-up completed.")
# # Log initial RAM usage
# ram_stats = get_ram_usage()
# logger.info(f"Initial RAM usage after startup: {ram_stats}")
# Add a background task to keep the model warm
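# NOTE: @app.on_event("startup") still works but is deprecated in recent
# FastAPI releases in favor of lifespan handlers.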
@app.on_event("startup")
async def setup_periodic_tasks():
    asyncio.create_task(keep_model_warm())
    logger.info("Periodic model warm-up task scheduled")
async def keep_model_warm():
    """Background task that keeps the model warm by sending periodic requests."""
    while True:
        try:
            logger.info("Performing periodic model warm-up")
            dummy_query = "Say only the word 'ok.'"
            # Run a one-token dummy completion to keep the model warm, holding
            # the lock so it never overlaps with a real request
            async with model_lock:
                llm.create_chat_completion(
                    messages=[{"role": "user", "content": dummy_query}],
                    max_tokens=1,
                    temperature=0.0,
                    top_p=1.0,
                    stream=False,
                )
            logger.info("Periodic warm-up completed")
        except Exception as e:
            logger.error(f"Error in periodic warm-up: {str(e)}")
        # Wait for 13 minutes before the next warm-up
        await asyncio.sleep(13 * 60)
# ─── OpenAI-compatible endpoint ──────────────────────────────────────────────
@app.post("/v1/chat/completions")
async def chat(req: dict):
print("Request:", req)
# if the client (Qwen-Agent) asked for a stream, proxy the SSE events:
if req.get("stream", False):
async def event_generator():
# llama_cpp will now yield tokens/chunks
for chunk in llm.create_chat_completion(
messages=req["messages"],
max_tokens=req.get("max_tokens", 256),
temperature=req.get("temperature", 0.7),
top_p=req.get("top_p", 1.0),
stream=True,
):
# SSE format: data: <json>\n\n
yield f"data: {json.dumps(chunk)}\n\n"
return StreamingResponse(event_generator(),
media_type="text/event-stream")
# otherwise, fall back to the usual non-streaming JSON response
resp = llm.create_chat_completion(
messages=req["messages"],
max_tokens=req.get("max_tokens", 256),
temperature=req.get("temperature", 0.7),
top_p=req.get("top_p", 1.0),
stream=False,
)
return JSONResponse({
"id": resp["id"],
"object": "chat.completion",
"created": resp.get("created", int(time.time())),
"model": "llama-cpp",
"choices": [{
"index": 0,
"message": {
"role": resp["choices"][0]["message"]["role"],
"content": resp["choices"][0]["message"]["content"],
},
"finish_reason": resp["choices"][0].get("finish_reason", "stop"),
}],
"usage": resp.get("usage", {}),
}) |
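# Example request (illustrative; module name, host, and port are assumptions and
# depend on how the app is served, e.g. `uvicorn main:app --host 0.0.0.0 --port 8000`):
#
#   curl http://localhost:8000/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"messages": [{"role": "user", "content": "Hello"}], "max_tokens": 32}'
#
# Add "stream": true to the payload to receive Server-Sent Events
# ("data: <json>\n\n" chunks, terminated by "data: [DONE]") instead of a single
# JSON body.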