# Spaces: ginigen / Mistral-Perflexity
# Running on Zero
# File size: 6,955 Bytes

import spaces
import json
import subprocess
import os
from llama_cpp import Llama
from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
from llama_cpp_agent.providers import LlamaCppPythonProvider
from llama_cpp_agent.chat_history import BasicChatHistory
from llama_cpp_agent.chat_history.messages import Roles
import gradio as gr
from huggingface_hub import hf_hub_download

# Lazily-initialised inference state shared across calls to respond():
# `llm` holds the loaded Llama instance and `llm_model` the name of the model
# it was built from. Both stay None until the first request loads the model.
llm = None
llm_model = None

# Define the model file name (also used to build the local model path).
MISTRAL_MODEL_NAME = "Private-BitSix-Mistral-Small-3.1-24B-Instruct-2503.gguf"

# Download the model file from the Hugging Face Hub into ./models.
# This runs at import time, before the UI is built.
model_path = hf_hub_download(
    repo_id="ginigen/Private-BitSix-Mistral-Small-3.1-24B-Instruct-2503",
    filename=MISTRAL_MODEL_NAME,
    local_dir="./models"
)

print(f"Downloaded model path: {model_path}")

# Custom stylesheet handed to gr.ChatInterface through its `css` argument.
# It overrides spacing/alignment of chat message rows, bubbles and message
# buttons, and sets border/background colours for the `.dark` variants.
css = """
.bubble-wrap {
    padding-top: calc(var(--spacing-xl) * 3) !important;
}
.message-row {
    justify-content: space-evenly !important;
    width: 100% !important;
    max-width: 100% !important;
    margin: calc(var(--spacing-xl)) 0 !important;
    padding: 0 calc(var(--spacing-xl) * 3) !important;
}
.flex-wrap.user {
    border-bottom-right-radius: var(--radius-lg) !important;
}
.flex-wrap.bot {
    border-bottom-left-radius: var(--radius-lg) !important;
}
.message.user{
    padding: 10px;
}
.message.bot{
    text-align: right;
    width: 100%;
    padding: 10px;
    border-radius: 10px;
}
.message-bubble-border {
    border-radius: 6px !important;
}
.message-buttons {
    justify-content: flex-end !important;
}
.message-buttons-left {
    align-self: end !important;
}
.message-buttons-bot, .message-buttons-user {
    right: 10px !important;
    left: auto !important;
    bottom: 2px !important;
}
.dark.message-bubble-border {
    border-color: #343140 !important;
}
.dark.user {
    background: #1e1c26 !important;
}
.dark.assistant.dark, .dark.pending.dark {
    background: #16141c !important;
}
"""

def get_messages_formatter_type(model_name):
    """Return the prompt formatter type to use for *model_name*.

    Model names containing "Mistral" or "BitSix" are mapped to the ChatML
    formatter.

    Raises:
        ValueError: If *model_name* contains neither marker.
    """
    supported_markers = ("Mistral", "BitSix")
    if not any(marker in model_name for marker in supported_markers):
        raise ValueError(f"Unsupported model: {model_name}")
    # Mistral-family models are served with the ChatML format here.
    return MessagesFormatterType.CHATML

def _append_history(messages, history):
    """Copy Gradio chat history entries into the agent chat history *messages*.

    Three entry shapes are accepted:

    * ``{'role': 'user' | 'assistant', 'content': str}`` -- the format produced
      by a ``gr.Chatbot(type="messages")`` component;
    * ``{'user': str, 'assistant': str}`` -- the shape this app assumed before;
    * ``(user_text, assistant_text)`` pairs -- Gradio's legacy "tuples" format.

    Entries with an unknown role, and turns whose content is empty or not a
    string, are skipped so that no blank messages end up in the prompt.
    """
    role_map = {"user": Roles.user, "assistant": Roles.assistant}

    def add(role, content):
        if isinstance(content, str) and content:
            messages.add_message({'role': role, 'content': content})

    for entry in history or []:
        if isinstance(entry, dict) and "role" in entry:
            role = role_map.get(entry["role"])
            if role is not None:
                add(role, entry.get("content"))
        elif isinstance(entry, dict):
            add(Roles.user, entry.get("user"))
            add(Roles.assistant, entry.get("assistant"))
        elif isinstance(entry, (list, tuple)) and len(entry) == 2:
            add(Roles.user, entry[0])
            add(Roles.assistant, entry[1])


@spaces.GPU(duration=120)
def respond(
    message,
    history: list[dict],  # passed as dicts rather than tuples
    system_message,
    max_tokens,
    temperature,
    top_p,
    top_k,
    repeat_penalty,
):
    """Stream the model's reply to *message*.

    Loads the model on first use (cached in the module-level ``llm`` /
    ``llm_model`` globals), replays *history* into a fresh chat history and
    streams the answer.

    Args:
        message: The new user message.
        history: Previous conversation turns as supplied by Gradio.
        system_message: System prompt for the agent.
        max_tokens: Maximum number of tokens to generate.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.
        top_k: Top-k sampling cutoff.
        repeat_penalty: Repetition penalty.

    Yields:
        The accumulated response text after each streamed chunk.
    """
    global llm
    global llm_model

    chat_template = get_messages_formatter_type(MISTRAL_MODEL_NAME)

    # Check the model file path.
    model_path_local = os.path.join("./models", MISTRAL_MODEL_NAME)

    print(f"Model path: {model_path_local}")

    if not os.path.exists(model_path_local):
        print(f"Warning: Model file not found at {model_path_local}")
        # Listing is diagnostic only; it must not raise on its own when the
        # directory itself is missing.
        if os.path.isdir('./models'):
            print(f"Available files in ./models: {os.listdir('./models')}")

    # Load the model once and reuse it across requests.
    if llm is None or llm_model != MISTRAL_MODEL_NAME:
        llm = Llama(
            model_path=model_path_local,
            flash_attn=True,
            n_gpu_layers=81,
            n_batch=1024,
            n_ctx=8192,
        )
        llm_model = MISTRAL_MODEL_NAME

    provider = LlamaCppPythonProvider(llm)

    agent = LlamaCppAgent(
        provider,
        system_prompt=f"{system_message}",
        predefined_messages_formatter_type=chat_template,
        debug_output=True
    )

    settings = provider.get_provider_default_settings()
    settings.temperature = temperature
    settings.top_k = top_k
    settings.top_p = top_p
    settings.max_tokens = max_tokens
    settings.repeat_penalty = repeat_penalty
    settings.stream = True

    messages = BasicChatHistory()

    # The chatbot is configured with type="messages", so each history entry is
    # a {'role', 'content'} dict. Reading 'user' / 'assistant' keys from it
    # (as was done before) always produced empty strings and dropped the whole
    # conversation context.
    _append_history(messages, history)

    stream = agent.get_chat_response(
        message,
        llm_sampling_settings=settings,
        chat_history=messages,
        returns_streaming_generator=True,
        print_output=False
    )

    # Yield the running text so the UI can re-render the growing reply.
    outputs = ""
    for output in stream:
        outputs += output
        yield outputs


# Build the chat UI. `respond` is the streaming chat handler; the widgets in
# `additional_inputs` are passed to it positionally after (message, history),
# so their order must match respond()'s remaining parameters:
# system_message, max_tokens, temperature, top_p, top_k, repeat_penalty.
demo = gr.ChatInterface(
    fn=respond,
    title="Ginigen Private AI",
    description="6BIT 양자화로 모델 크기는 줄이고 성능은 유지하는 프라이버시 중심 AI 솔루션: The Ginigen Private-BitSix framework simplifies interactions with Large Language Models (LLMs), providing an interface for chatting, executing function calls, generating structured output, performing retrieval augmented generation, and processing text using agentic chains with tools.",
    # Soft theme with violet accents; the .set(...) overrides tune dark-mode colours.
    theme=gr.themes.Soft(
        primary_hue="violet",
        secondary_hue="violet",
        neutral_hue="gray",
        font=[gr.themes.GoogleFont("Exo"), "ui-sans-serif", "system-ui", "sans-serif"]
    ).set(
        body_background_fill_dark="#16141c",
        block_background_fill_dark="#16141c",
        block_border_width="1px",
        block_title_background_fill_dark="#1e1c26",
        input_background_fill_dark="#292733",
        button_secondary_background_fill_dark="#24212b",
        border_color_accent_dark="#343140",
        border_color_primary_dark="#343140",
        background_fill_secondary_dark="#16141c",
        color_accent_soft_dark="transparent",
        code_background_fill_dark="#292733",
    ),
    css=css,
    # Example prompts (Korean) offered to the user.
    examples=[
        ["안녕하세요, 저는 AI에 관심이 많습니다. 양자화란 무엇인가요?"],
        ["미스트랄 모델의 특징은 무엇인가요?"],
        ["긴 컨텍스트(context)를 처리하는 방법을 설명해 주세요."]
    ],
    additional_inputs=[
        # System message.
        # NOTE(review): the prompt says "inside tags" without naming the tags;
        # possibly tag markup was lost from this string -- confirm.
        gr.Textbox(
            value="You are a deep thinking AI, you may use extremely long chains of thought to deeply consider the problem and deliberate with yourself via systematic reasoning processes to help come to a correct solution prior to answering. You should enclose your thoughts and internal monologue inside tags, and then provide your solution or response to the problem.",
            label="시스템 메시지",
            lines=5
        ),
        # Maximum number of tokens to generate.
        gr.Slider(minimum=1, maximum=4096, value=2048, step=1, label="최대 토큰 수"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
        gr.Slider(minimum=0, maximum=100, value=40, step=1, label="Top-k"),
        gr.Slider(minimum=0.0, maximum=2.0, value=1.1, step=0.1, label="Repetition penalty"),
    ],
    # The chatbot uses the "messages" format (role/content dicts).
    chatbot=gr.Chatbot(type="messages")
)

# Start the Gradio server only when run as a script.
if __name__ == "__main__":
    demo.launch()