File size: 4,951 Bytes
7d65968
 
 
61d4f38
a745fd4
7d65968
3067905
7d65968
3067905
 
 
 
7d65968
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61d4f38
7d65968
 
61d4f38
7d65968
 
 
 
999bf7c
 
 
7d65968
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a745fd4
61d4f38
 
 
 
 
 
 
 
 
a745fd4
7d65968
a745fd4
 
 
7d65968
61d4f38
 
 
 
7d65968
 
 
61d4f38
 
 
7d65968
 
a745fd4
61d4f38
 
 
 
 
 
a745fd4
7d65968
a745fd4
7d65968
 
 
3067905
 
 
 
7d65968
3067905
 
 
 
a745fd4
 
 
 
 
 
3067905
7d65968
 
a745fd4
 
 
3067905
a745fd4
 
 
3067905
 
 
 
 
7d65968
3067905
 
7d65968
 
 
3067905
a745fd4
61d4f38
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
import os
from typing import List, Optional, Tuple

from fastapi import FastAPI, HTTPException
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
from llama_cpp_agent.chat_history import BasicChatHistory
from llama_cpp_agent.chat_history.messages import Roles
from llama_cpp_agent.providers import LlamaCppPythonProvider
from pydantic import BaseModel

# Suppress warnings emitted on import/load by the model libraries.
# NOTE(review): this silences ALL warnings process-wide, not just theirs.
import warnings
warnings.filterwarnings("ignore")

# Ensure the local model cache directory exists before any downloads.
MODEL_DIR = "./models"  # relative to the process working directory
os.makedirs(MODEL_DIR, exist_ok=True)

# GGUF model files fetched at startup; each entry's keys mirror the
# keyword arguments of hf_hub_download (repo_id, filename).
MODELS_INFO = [
    {
        "repo_id": "bartowski/Dolphin3.0-Llama3.2-1B-GGUF",
        "filename": "Dolphin3.0-Llama3.2-1B-Q4_K_M.gguf"
    },
    {
        "repo_id": "bartowski/Dolphin3.0-Qwen2.5-0.5B-GGUF",
        "filename": "Dolphin3.0-Qwen2.5-0.5B-Q6_K.gguf"
    },
    {
        "repo_id": "bartowski/Qwen2.5-Coder-14B-Instruct-GGUF",
        "filename": "Qwen2.5-Coder-14B-Instruct-Q6_K.gguf"
    }
]

# Fetch any model file that is not already cached locally. Best-effort:
# a failed download is reported but does not abort startup.
for entry in MODELS_INFO:
    filename = entry["filename"]
    repo_id = entry["repo_id"]
    target_path = os.path.join(MODEL_DIR, filename)
    if os.path.exists(target_path):
        continue  # already downloaded
    print(f"Downloading {filename} from {repo_id}...")
    try:
        hf_hub_download(
            repo_id=repo_id,
            filename=filename,
            local_dir=MODEL_DIR,
        )
    except Exception as e:
        print(f"Error downloading {filename}: {e}")
    else:
        print(f"Downloaded {filename}")

# Short API model keys -> GGUF filenames (must match MODELS_INFO above).
AVAILABLE_MODELS = {
    "qwen": "Dolphin3.0-Qwen2.5-0.5B-Q6_K.gguf",
    "llama": "Dolphin3.0-Llama3.2-1B-Q4_K_M.gguf",
    "coder": "Qwen2.5-Coder-14B-Instruct-Q6_K.gguf"
}

# Lazily-initialized singleton Llama instance and the filename it was
# loaded from; both are managed exclusively by load_model().
llm = None
llm_model = None

def load_model(model_key: str):
    """Ensure the global Llama instance matches *model_key* and return it.

    Loading a GGUF model is expensive, so the previously loaded instance
    is reused whenever the requested model is already active.

    Raises:
        ValueError: *model_key* is not in AVAILABLE_MODELS.
        FileNotFoundError: the model file has not been downloaded.
    """
    global llm, llm_model

    model_name = AVAILABLE_MODELS.get(model_key)
    if model_name is None:
        raise ValueError(f"Invalid model key: {model_key}")

    model_path = os.path.join(MODEL_DIR, model_name)
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"Model file not found at {model_path}")

    needs_load = llm is None or llm_model != model_name
    if needs_load:
        llm = Llama(
            model_path=model_path,
            flash_attn=False,
            n_gpu_layers=0,  # CPU-only inference
            n_batch=8,
            n_ctx=2048,
            n_threads=8,
            n_threads_batch=8,
        )
        llm_model = model_name  # record which file is resident
    return llm


class ChatRequest(BaseModel):
    """Request body for POST /chat."""
    message: str  # Required user prompt
    history: Optional[List[Tuple[str, str]]] = []  # Prior (user, assistant) turns
    model: Optional[str] = "qwen"  # Key into AVAILABLE_MODELS
    system_prompt: Optional[str] = "You are Dolphin, a helpful AI assistant."
    max_tokens: Optional[int] = 1024  # Generation length cap
    temperature: Optional[float] = 0.7  # Sampling temperature
    top_p: Optional[float] = 0.95  # Nucleus sampling threshold
    top_k: Optional[int] = 40  # Top-k sampling cutoff
    repeat_penalty: Optional[float] = 1.1  # Penalty against repeated tokens


class ChatResponse(BaseModel):
    """Response body for POST /chat."""
    response: str  # The model's generated reply


class ModelInfoResponse(BaseModel):
    """Response body for GET /models."""
    models: List[str]  # Model keys accepted by /chat


# FastAPI application; Swagger UI served at /docs, ReDoc disabled.
app = FastAPI(
    title="Dolphin 3.0 LLM API",
    description="REST API for Dolphin 3.0 models using Llama.cpp backend.",
    version="1.0",
    docs_url="/docs",  # Only Swagger docs
    redoc_url=None    # Disable ReDoc
)


@app.get("/models", response_model=ModelInfoResponse)
def get_available_models():
    """Returns the list of supported models."""
    return {"models": list(AVAILABLE_MODELS.keys())}


@app.post("/chat", response_model=ChatResponse)
def chat(request: ChatRequest):
    try:
        # Load model
        load_model(request.model)

        provider = LlamaCppPythonProvider(llm)

        agent = LlamaCppAgent(
            provider,
            system_prompt=request.system_prompt,
            predefined_messages_formatter_type=MessagesFormatterType.CHATML,
        )

        settings = provider.get_provider_default_settings()
        settings.temperature = request.temperature
        settings.top_k = request.top_k
        settings.top_p = request.top_p
        settings.max_tokens = request.max_tokens
        settings.repeat_penalty = request.repeat_penalty

        messages = BasicChatHistory()

        # Add history
        for user_msg, assistant_msg in request.history:
            messages.add_message({"role": Roles.user, "content": user_msg})
            messages.add_message({"role": Roles.assistant, "content": assistant_msg})

        # Get response
        response = agent.get_chat_response(
            request.message,
            llm_sampling_settings=settings,
            chat_history=messages,
            print_output=False,
        )

        return {"response": response}

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)