from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed
import gradio as gr
import datetime

# Load the tokenizer and model weights from the Hugging Face Hub
tokenizer = AutoTokenizer.from_pretrained("bunnycore/Chimera-Apex-7B")
model = AutoModelForCausalLM.from_pretrained("bunnycore/Chimera-Apex-7B")
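# Loading a 7B model in full float32 precision needs roughly 28 GB of memory.
# A hedged alternative for a GPU host (assumes CUDA and the `accelerate`
# package are available; not part of the original app):
# model = AutoModelForCausalLM.from_pretrained(
#     "bunnycore/Chimera-Apex-7B", torch_dtype="auto", device_map="auto"
# )
# With device_map="auto", input tensors must be moved to the model's device
# before calling generate().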

def format_prompt(message, history):
    # Build a Mistral-style instruction prompt from the chat history:
    # each past turn is wrapped as [INST] user [/INST] response</s>.
    prompt = "<s>"
    for user_prompt, bot_response in history:
        prompt += f"[INST] {user_prompt} [/INST]"
        prompt += f" {bot_response}</s> "
    prompt += f"[INST] {message} [/INST]"
    return prompt
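# Example (hypothetical one-turn history), given the template above:
# format_prompt("How are you?", [("Hi", "Hello!")])
# -> "<s>[INST] Hi [/INST] Hello!</s> [INST] How are you? [/INST]"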

def generate(prompt, history, system_prompt, temperature=0.9, max_new_tokens=9048, top_p=0.95, repetition_penalty=1.0):
    temperature = max(float(temperature), 1e-2)  # guard against temperature=0, which breaks sampling
    top_p = float(top_p)
    set_seed(42)  # model.generate() has no `seed` kwarg; seed the RNG globally instead
    generate_kwargs = dict(
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,  # silence the missing-pad-token warning
    )
    now = datetime.datetime.now()
    formatted_time = now.strftime("%H:%M:%S, %B %d, %Y")
    # Note: this hard-coded persona replaces whatever the UI's System Prompt box contains.
    system_prompt = f"System time: {formatted_time}. Instructions: Everything else is from the user. You are Milo, an AI assistant created by ConvoLite in 2024 (he/him). Be friendly and empathetic, matching the user's tone. Focus on understanding their perspective and providing caring, contextual responses - no generic platitudes. Keep it conversational, not overly formal."
    formatted_prompt = format_prompt(f"{system_prompt}, {prompt}", history)
    inputs = tokenizer.encode(formatted_prompt, return_tensors="pt")
    outputs = model.generate(inputs, **generate_kwargs)
    # Decode only the newly generated tokens so the prompt is not echoed back to the user.
    output = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True)
    yield output
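# The single yield above returns the whole reply in one piece. A hedged sketch
# of token-by-token streaming with transformers' TextIteratorStreamer follows,
# kept commented out because it changes the app's behavior (the worker thread
# and the generate_streaming name are illustrative assumptions, not part of
# the original app):
#
# from threading import Thread
# from transformers import TextIteratorStreamer
#
# def generate_streaming(prompt, history, system_prompt):
#     inputs = tokenizer.encode(format_prompt(prompt, history), return_tensors="pt")
#     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
#     # generate() blocks, so it runs in a worker thread while we drain the streamer
#     thread = Thread(target=model.generate,
#                     kwargs=dict(inputs=inputs, streamer=streamer, max_new_tokens=512))
#     thread.start()
#     partial = ""
#     for new_text in streamer:
#         partial += new_text
#         yield partial  # ChatInterface re-renders each partial string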

additional_inputs = [
    gr.Textbox(label="System Prompt", max_lines=1, interactive=True),
    gr.Slider(label="Temperature", value=0.9, minimum=0.0, maximum=1.0, step=0.05, interactive=True, info="Higher values produce more diverse outputs"),
    gr.Slider(label="Max new tokens", value=9048, minimum=256, maximum=9048, step=64, interactive=True, info="The maximum numbers of new tokens"),
    gr.Slider(label="Top-p (nucleus sampling)", value=0.90, minimum=0.0, maximum=1, step=0.05, interactive=True, info="Higher values sample more low-probability tokens"),
    gr.Slider(label="Repetition penalty", value=1.2, minimum=1.0, maximum=2.0, step=0.05, interactive=True, info="Penalize repeated tokens")
]

avatar_images = ("https://i.postimg.cc/pXjKKVXG/user-circle.png", "https://i.postimg.cc/qq04Yz93/CL3.png")

# Wire generate() into a Gradio chat UI and launch it.
gr.ChatInterface(
    fn=generate,
    chatbot=gr.Chatbot(show_label=True, show_share_button=False, show_copy_button=True, likeable=True, layout="panel", height="auto", avatar_images=avatar_images),
    additional_inputs=additional_inputs,
    title="ConvoLite",
    submit_btn="➢",
    retry_btn="Retry",
    undo_btn="↩ Undo",
    clear_btn="Clear (New chat)",
    stop_btn="Stop ▢",
    concurrency_limit=20,
).launch(show_api=False)