from ctransformers import AutoModelForCausalLM
import os
import time

# Configure cache
os.environ['HF_HOME'] = '/tmp/cache'

# Load GGUF model
model = AutoModelForCausalLM.from_pretrained(
    "mradermacher/Ninja-v1-NSFW-RP-GGUF",
    model_file="ninja-v1.Q5_K_M.gguf",  # Medium quantization
    model_type="llama",
    gpu_layers=0,  # CPU only
    context_length=4096  # Max context size
)

def generate_chat_completion(messages, max_tokens=560, temperature=0.7):
    """Generate a chat response in OpenAI chat-completion format."""
    # Format messages as prompt
    prompt = "\n".join(f"{m['role']}: {m['content']}" for m in messages)
    prompt += "\nassistant:"
    
    # Generate response
    response = model(
        prompt,
        max_new_tokens=max_tokens,
        temperature=temperature,
        stop=["</s>", "user:", "system:"],
        stream=False
    )
    
    return {
        "id": f"chatcmpl-{os.urandom(8).hex()}",
        "object": "chat.completion",
        "created": int(time.time()),
        "model": "Ninja-v1-NSFW-RP",
        "choices": [{
            "index": 0,
            "message": {
                "role": "assistant",
                "content": response
            },
            "finish_reason": "stop"
        }],
        "usage": {
            "prompt_tokens": len(prompt.split()),
            "completion_tokens": len(response.split()),
            "total_tokens": len(prompt.split()) + len(response.split())
        }
    }
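
# --- Example usage (illustrative sketch; not part of the original file) ---
# The messages below are assumed placeholders showing how the OpenAI-style
# response dictionary returned by generate_chat_completion can be consumed.
if __name__ == "__main__":
    example_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Say hello in one sentence."},
    ]
    result = generate_chat_completion(example_messages, max_tokens=64)
    print(result["choices"][0]["message"]["content"])
    print("approx. tokens used:", result["usage"]["total_tokens"])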