import os

import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

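# Generation limits; MAX_INPUT_TOKEN_LENGTH can be overridden via the environment.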
MAX_MAX_NEW_TOKENS = 2048
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))

DESCRIPTION = """\
# L-MChat
This Space demonstrates [L-MChat](https://huggingface.co/collections/Artples/l-mchat-663265a8351231c428318a8f) by L-AI.
"""

if not torch.cuda.is_available():
    DESCRIPTION += "\n<p>Running on CPU! This demo does not work on CPU.</p>"

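# Display names mapped to their Hugging Face model ids.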
model_details = {
    "Fast-Model": "Artples/L-MChat-Small",
    "Quality-Model": "Artples/L-MChat-7b"
}

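# Load all models and tokenizers once at startup so the UI can switch between them instantly.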
models = {
    name: AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
    for name, model_id in model_details.items()
}
tokenizers = {
    name: AutoTokenizer.from_pretrained(model_id)
    for name, model_id in model_details.items()
}

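# On ZeroGPU Spaces, reserve a GPU worker for up to 90 seconds per call.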
@spaces.GPU(enable_queue=True, duration=90)
def generate(
    message: str,
    chat_history: list[tuple[str, str]],
    system_prompt: str,
    model_choice: str,
    max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
    temperature: float = 0.1,
    top_p: float = 0.9,
    top_k: int = 50,
    repetition_penalty: float = 1.2,
) -> str:
    model = models[model_choice]
    tokenizer = tokenizers[model_choice]

    # Rebuild the conversation as a flat list of role/content messages.
    conversation = [{"role": "system", "content": system_prompt}] if system_prompt else []
    for user, assistant in chat_history:
        conversation.append({"role": "user", "content": user})
        conversation.append({"role": "assistant", "content": assistant})
    conversation.append({"role": "user", "content": message})

    # Apply the model's chat template, then drop the oldest tokens if the prompt is too long.
    input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
    input_ids = input_ids.to(model.device)

    output_ids = model.generate(
        input_ids,
        max_new_tokens=min(max_new_tokens, MAX_MAX_NEW_TOKENS),
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
    )
    # Decode only the newly generated tokens, not the echoed prompt.
    output_text = tokenizer.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=True)

    return output_text


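# The additional inputs below are passed to `generate` after (message, chat_history),
# so their order here must match the function signature.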
chat_interface = gr.ChatInterface(
    fn=generate,
    theme="ehristoforu/RE_Theme",
    additional_inputs=[
        gr.Textbox(label="System prompt", lines=6),
        gr.Dropdown(label="Model Choice", choices=list(model_details.keys()), value="Quality-Model"),
    ],
    examples=[
        ["Hello there! How are you doing?"],
        ["Can you briefly explain what the Python programming language is?"],
        ["Explain the plot of Cinderella in a sentence."],
        ["How many hours does it take a man to eat a helicopter?"],
        ["Write a 100-word article on 'Benefits of Open-Source in AI research'."],
    ],
)

with gr.Blocks(css="style.css") as demo:
    gr.Markdown(DESCRIPTION)
    chat_interface.render()

if __name__ == "__main__":
    demo.launch()