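# Gradio demo Space for Tonic's Starling 7B, serving
# berkeley-nest/Starling-LM-7B-alpha with adjustable sampling controls.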
model_name = "berkeley-nest/Starling-LM-7B-alpha"
title = """# 👋🏻Welcome to Tonic's 💫🌠Starling 7B"""
description = """You can use [💫🌠Starling 7B](https://huggingface.co/berkeley-nest/Starling-LM-7B-alpha) or duplicate it for local use or on Hugging Face! [Join me on Discord to build together](https://discord.gg/VqTxc76K3u)."""
import os
import gc

import torch
import gradio as gr
# accelerate must be installed for device_map="auto" below, but it is not imported directly.
from transformers import AutoTokenizer, AutoModelForCausalLM
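# Prefer the GPU when available; device_map="auto" places the model weights
# automatically, but input tensors are still moved explicitly.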
device = "cuda" if torch.cuda.is_available() else "cpu"
# Special token ids for the Starling/OpenChat tokenizer
# (32000 is <|end_of_turn|> in this vocabulary; 32001 is the added pad token).
bos_token_id = 1
eos_token_id = 32000
pad_token_id = 32001
# Default sampling settings
temperature = 0.4
max_new_tokens = 240
top_p = 0.92
repetition_penalty = 1.7
# Reduce CUDA memory fragmentation; this setting must be in place before the
# first CUDA allocation, so it is applied before the model is loaded.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:50'

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="auto")
model.eval()
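# Minimal chat wrapper: builds the OpenChat-style prompt and calls model.generate.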
class StarlingBot:
    def __init__(self, system_prompt="I am Starling-7B by Tonic-AI, and I am ready to do anything to help my user."):
        self.system_prompt = system_prompt

    def predict(self, user_message, assistant_message, system_prompt, do_sample, temperature=0.4, max_new_tokens=700, top_p=0.99, repetition_penalty=1.9):
        input_ids = None
        try:
            # OpenChat-style template: each turn is labeled "GPT4 Correct <role>:" and closed with <|end_of_turn|>.
            conversation = f" GPT4 Correct Assistant: {system_prompt if system_prompt else self.system_prompt} <|end_of_turn|> GPT4 Correct Assistant: {assistant_message if assistant_message else ''} <|end_of_turn|> GPT4 Correct User: {user_message} <|end_of_turn|> GPT4 Correct Assistant:"
            input_ids = tokenizer.encode(conversation, return_tensors="pt", add_special_tokens=True)
            input_ids = input_ids.to(device)
            response = model.generate(
                input_ids=input_ids,
                use_cache=True,
                early_stopping=False,
                bos_token_id=bos_token_id,
                eos_token_id=eos_token_id,
                pad_token_id=pad_token_id,
                do_sample=do_sample,  # honor the UI checkbox instead of hard-coding True
                temperature=temperature,
                max_new_tokens=max_new_tokens,
                top_p=top_p,
                repetition_penalty=repetition_penalty,
            )
            return tokenizer.decode(response[0], skip_special_tokens=True)
        finally:
            # Release the prompt tensor and cached CUDA blocks between requests.
            del input_ids
            gc.collect()
            torch.cuda.empty_cache()
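# Quick smoke test outside Gradio (hypothetical values; assumes a GPU with
# enough memory for the 7B model):
#   bot = StarlingBot()
#   print(bot.predict("Hello!", "", "", True, 0.4, 64, 0.92, 1.7))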
# Example inputs, ordered to match the Gradio `inputs` list below
# (not currently wired to the UI; could be passed to gr.Examples).
examples = [
    [
        "The following dialogue is a conversation between Emmanuel Macron and Elon Musk:",  # user_message
        "[Emmanuel Macron]: Hello Mr. Musk. Thank you for receiving me today.",  # assistant_message
        "",    # system_prompt
        True,  # do_sample
        0.9,   # temperature
        450,   # max_new_tokens
        0.90,  # top_p
        1.9,   # repetition_penalty
    ]
]
# Initialize StarlingBot
starling_bot = StarlingBot()

def gradio_starling(user_message, assistant_message, system_prompt, do_sample, temperature, max_new_tokens, top_p, repetition_penalty):
    return starling_bot.predict(user_message, assistant_message, system_prompt, do_sample, temperature, max_new_tokens, top_p, repetition_penalty)
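# Gradio Blocks UI: prompt textboxes, sampling controls, and a submit button
# wired to gradio_starling.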
with gr.Blocks(theme="ParityError/Anime") as demo:
    gr.Markdown(title)
    gr.Markdown(description)
    with gr.Row():
        system_prompt = gr.Textbox(label="Optional💫🌠Starling System Prompt", lines=2)
        assistant_message = gr.Textbox(label="💫🌠Starling Assistant Message", lines=2)
    user_message = gr.Textbox(label="Your Message", lines=3)
    with gr.Row():
        do_sample = gr.Checkbox(label="Advanced", value=True)
    # gr.Accordion expects a plain bool for `open`, not a callable.
    with gr.Accordion("Advanced Settings", open=True):
        with gr.Row():
            temperature = gr.Slider(label="Temperature", value=0.4, minimum=0.05, maximum=1.0, step=0.05)
            max_new_tokens = gr.Slider(label="Max new tokens", value=100, minimum=25, maximum=800, step=1)
            # top_p is a probability mass, so it must stay within (0, 1].
            top_p = gr.Slider(label="Top-p (nucleus sampling)", value=0.92, minimum=0.01, maximum=1.0, step=0.01)
            repetition_penalty = gr.Slider(label="Repetition penalty", value=1.9, minimum=1.0, maximum=2.0, step=0.05)
    submit_button = gr.Button("Submit")
    output_text = gr.Textbox(label="💫🌠Starling Response")
    submit_button.click(
        gradio_starling,
        inputs=[user_message, assistant_message, system_prompt, do_sample, temperature, max_new_tokens, top_p, repetition_penalty],
        outputs=output_text,
    )
demo.launch()