from ctransformers import AutoModelForCausalLM
import os
import time

# Configure the Hugging Face cache location
os.environ['HF_HOME'] = '/tmp/cache'

# Load the GGUF model
model = AutoModelForCausalLM.from_pretrained(
    "mradermacher/Ninja-v1-NSFW-RP-GGUF",
    model_file="ninja-v1.Q5_K_M.gguf",  # Medium quantization
    model_type="llama",
    gpu_layers=0,          # CPU only
    context_length=4096    # Max context size
)

def generate_chat_completion(messages, max_tokens=560, temperature=0.7):
    """Generate a chat response in OpenAI chat-completion format."""
    # Flatten the message list into a simple role-prefixed prompt
    prompt = "\n".join(f"{m['role']}: {m['content']}" for m in messages)
    prompt += "\nassistant:"

    # Generate the response
    response = model(
        prompt,
        max_new_tokens=max_tokens,
        temperature=temperature,
        stop=["</s>", "user:", "system:"],  # stop on end-of-sequence token or a new role marker
        stream=False
    )

    return {
        "id": f"chatcmpl-{os.urandom(8).hex()}",
        "object": "chat.completion",
        "created": int(time.time()),
        "model": "Ninja-v1-NSFW-RP",
        "choices": [{
            "index": 0,
            "message": {
                "role": "assistant",
                "content": response
            },
            "finish_reason": "stop"
        }],
        # Rough word-count approximations, not true tokenizer counts
        "usage": {
            "prompt_tokens": len(prompt.split()),
            "completion_tokens": len(response.split()),
            "total_tokens": len(prompt.split()) + len(response.split())
        }
    }
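
# Minimal usage sketch: the messages below are hypothetical placeholders,
# not part of the original example; any system/user content will work.
if __name__ == "__main__":
    sample_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Introduce yourself in one sentence."},
    ]
    completion = generate_chat_completion(sample_messages, max_tokens=128)
    print(completion["choices"][0]["message"]["content"])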