# app.py — L-MChat demo Space (Hugging Face Spaces, running on ZeroGPU).
import os
import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# Generation limits; MAX_INPUT_TOKEN_LENGTH can be overridden via the environment.
MAX_MAX_NEW_TOKENS = 2048
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
DESCRIPTION = """\
# L-MChat
This Space demonstrates [L-MChat](https://huggingface.co/collections/Artples/l-mchat-663265a8351231c428318a8f) by L-AI.
"""
if not torch.cuda.is_available():
    DESCRIPTION += "\n<p>Running on CPU! This demo does not work on CPU.</p>"
# Model checkpoints offered in the UI.
model_details = {
    "Fast-Model": "Artples/L-MChat-Small",
    "Quality-Model": "Artples/L-MChat-7b",
}

# Load every model and tokenizer once at startup so switching in the UI is instant.
models = {name: AutoModelForCausalLM.from_pretrained(model_id, device_map="auto") for name, model_id in model_details.items()}
tokenizers = {name: AutoTokenizer.from_pretrained(model_id) for name, model_id in model_details.items()}
@spaces.GPU(enable_queue=True, duration=90)
def generate(
    message: str,
    chat_history: list[tuple[str, str]],
    system_prompt: str,
    model_choice: str,
    max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
    temperature: float = 0.1,
    top_p: float = 0.9,
    top_k: int = 50,
    repetition_penalty: float = 1.2,
) -> str:
    # gr.ChatInterface calls fn(message, history, *additional_inputs), so the
    # system prompt and model choice must follow the chat history.
    model = models[model_choice]
    tokenizer = tokenizers[model_choice]

    # Build the conversation in the chat-template format expected by the tokenizer.
    conversation = [{"role": "system", "content": system_prompt}] if system_prompt else []
    for user, assistant in chat_history:
        conversation.append({"role": "user", "content": user})
        conversation.append({"role": "assistant", "content": assistant})
    conversation.append({"role": "user", "content": message})

    input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
        # Keep only the most recent tokens if the conversation is too long.
        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
    input_ids = input_ids.to(model.device)

    output_ids = model.generate(
        input_ids, max_new_tokens=max_new_tokens, do_sample=True, temperature=temperature,
        top_p=top_p, top_k=top_k, repetition_penalty=repetition_penalty,
    )
    # Decode only the newly generated tokens, not the prompt.
    output_text = tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True)
    return output_text
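
# Optional local smoke test (not part of the original Space; prompt and settings
# below are illustrative assumptions). It calls generate() directly, bypassing
# the Gradio UI and the ZeroGPU queue:
#
#     print(generate("Hello there!", [], "", "Fast-Model", max_new_tokens=32))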
chat_interface = gr.ChatInterface(
    fn=generate,
    theme='ehristoforu/RE_Theme',
    # Order must match the extra parameters of generate(): system prompt, then model choice.
    # Sampling parameters are not exposed in the UI, so generate()'s defaults apply.
    additional_inputs=[
        gr.Textbox(label="System prompt", lines=6),
        gr.Dropdown(label="Model Choice", choices=list(model_details.keys()), value="Quality-Model"),
    ],
    examples=[
        ["Hello there! How are you doing?"],
        ["Can you briefly explain what the Python programming language is?"],
        ["Explain the plot of Cinderella in a sentence."],
        ["How many hours does it take a man to eat a helicopter?"],
        ["Write a 100-word article on 'Benefits of Open-Source in AI Research'."],
    ],
)
with gr.Blocks(css="style.css") as demo:
    gr.Markdown(DESCRIPTION)
    chat_interface.render()

if __name__ == "__main__":
    demo.launch()
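    # Note (assumption, not in the original file): for heavier traffic one could
    # enable request queuing before launching, e.g. `demo.queue(max_size=20).launch()`.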