# Hugging Face Spaces page residue: the Space reported "Runtime error"
# (likely the missing `time` import fixed below and/or the module-level
# NameError at the end of this file).
import os
import time

from ctransformers import AutoModelForCausalLM
# Configure cache | |
os.environ['HF_HOME'] = '/tmp/cache' | |
# Load GGUF model | |
model = AutoModelForCausalLM.from_pretrained( | |
"mradermacher/Ninja-v1-NSFW-RP-GGUF", | |
model_file="ninja-v1.Q5_K_M.gguf", # Medium quantization | |
model_type="llama", | |
gpu_layers=0, # CPU only | |
context_length=4096 # Max context size | |
) | |
def generate_chat_completion(messages, max_tokens=1080, temperature=0.8):
    """Generate a chat response shaped like an OpenAI chat.completion object.

    Args:
        messages: list of ``{"role": str, "content": str}`` dicts forming
            the conversation so far.
        max_tokens: maximum number of new tokens to generate.
        temperature: sampling temperature passed to the model.

    Returns:
        dict mimicking the OpenAI ``/v1/chat/completions`` response schema
        (``id``, ``object``, ``created``, ``model``, ``choices``, ``usage``).
    """
    # Flatten the conversation into a plain "role: content" transcript and
    # cue the model to continue as the assistant.
    prompt = "\n".join(f"{m['role']}: {m['content']}" for m in messages)
    prompt += "\nassistant:"

    # Run inference; the stop sequences keep the model from generating
    # turns for the other roles.
    response = model(
        prompt,
        max_new_tokens=max_tokens,
        temperature=temperature,
        stop=["</s>", "user:", "system:"],
        stream=False,
    )

    # NOTE: whitespace-splitting is only an approximation of real token
    # counts — good enough for rough "usage" accounting, computed once here
    # instead of re-splitting for the total.
    prompt_tokens = len(prompt.split())
    completion_tokens = len(response.split())

    return {
        "id": f"chatcmpl-{os.urandom(8).hex()}",
        "object": "chat.completion",
        "created": int(time.time()),  # requires `import time` at file top
        "model": "Ninja-v1-NSFW-RP",
        "choices": [{
            "index": 0,
            "message": {
                "role": "assistant",
                "content": response,
            },
            "finish_reason": "stop",
        }],
        "usage": {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total_tokens": prompt_tokens + completion_tokens,
        },
    }
# Optional exact token counting: load the tokenizer matching the model.
# NOTE(review): GGUF-only repos often ship no tokenizer files, so this
# from_pretrained call may itself fail — verify against the repo contents.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "mradermacher/Ninja-v1-NSFW-RP-GGUF",
    use_fast=False,
)


def count_tokens(text):
    """Return the exact token count of *text* via the model tokenizer.

    Intended to replace the whitespace-split approximation used for the
    "usage" fields in generate_chat_completion(); the original pasted
    snippet referenced `prompt`/`response` at module scope, which are
    undefined there and raised NameError on import.
    """
    return len(tokenizer.encode(text))