model_name = "berkeley-nest/Starling-LM-7B-alpha"
title = """# 👋🏻Welcome to Tonic's 💫🌠Starling 7B"""
description = """You can use [💫🌠Starling 7B](https://huggingface.co/berkeley-nest/Starling-LM-7B-alpha) or duplicate it for local use or on Hugging Face! [Join me on Discord to build together](https://discord.gg/VqTxc76K3u)."""
import gc
import os

import torch
import gradio as gr
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import accelerate  # required for device_map="auto" below
import bitsandbytes  # optional; only used if you switch to a quantized load
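# Rough memory note (an estimate, not measured on this Space): 7B parameters in
# bfloat16 are about 14 GB of weights, so a 16 GB GPU is a comfortable fit; on a
# smaller card, device_map="auto" offloads layers to CPU at a latency cost.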
device = "cuda" if torch.cuda.is_available() else "cpu"

# Special-token ids from the OpenChat vocabulary that Starling inherits:
# 1 = <s>, 32000 = <|end_of_turn|>, 32001 = <|pad_0|>
bos_token_id = 1
eos_token_id = 32000
pad_token_id = 32001

# Module-level sampling defaults (predict() receives per-call values from the Gradio sliders)
temperature = 0.4
max_new_tokens = 240
top_p = 0.92
repetition_penalty = 1.7
# The allocator setting must be exported before the first CUDA allocation
# (i.e. before the model loads) to have any effect
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:50'

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="auto")
model.eval()
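# bitsandbytes is imported above but unused; if VRAM is tight, a 4-bit load is
# one alternative (a sketch, not what this Space actually runs):
#
#   model = AutoModelForCausalLM.from_pretrained(
#       model_name,
#       quantization_config=transformers.BitsAndBytesConfig(load_in_4bit=True),
#       device_map="auto",
#   )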
class StarlingBot:
    def __init__(self, system_prompt="I am Starling-7B by Tonic-AI, and I am ready to do anything to help my user."):
        self.system_prompt = system_prompt

    def predict(self, user_message, assistant_message, system_prompt, do_sample, temperature=0.4, max_new_tokens=700, top_p=0.99, repetition_penalty=1.9):
        try:
            # Starling-LM-7B-alpha is trained on the OpenChat 3.5 chat template
            # ("GPT4 Correct User: ...<|end_of_turn|>GPT4 Correct Assistant:"),
            # not the Llama [INST] format used here originally. Prepending the
            # system prompt as a leading turn is an approximation.
            system = system_prompt if system_prompt else self.system_prompt
            conversation = f"{system}<|end_of_turn|>"
            if assistant_message:
                conversation += f"GPT4 Correct Assistant: {assistant_message}<|end_of_turn|>"
            conversation += f"GPT4 Correct User: {user_message}<|end_of_turn|>GPT4 Correct Assistant:"
            input_ids = tokenizer.encode(conversation, return_tensors="pt", add_special_tokens=True)
            input_ids = input_ids.to(device)
            response = model.generate(
                input_ids=input_ids,
                use_cache=True,
                bos_token_id=bos_token_id,
                eos_token_id=eos_token_id,
                pad_token_id=pad_token_id,
                temperature=temperature,
                do_sample=do_sample,
                max_new_tokens=max_new_tokens,
                top_p=top_p,
                repetition_penalty=repetition_penalty
            )
            # Decode only the newly generated tokens so the prompt is not echoed back
            response_text = tokenizer.decode(response[0][input_ids.shape[-1]:], skip_special_tokens=True)
            return response_text
        finally:
            # Free GPU memory between requests; input_ids may not exist if encoding failed
            if 'input_ids' in locals():
                del input_ids
            gc.collect()
            torch.cuda.empty_cache()
starling_bot = StarlingBot()
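# Quick local smoke test (hypothetical values; greedy decoding, no Gradio UI):
# print(starling_bot.predict(
#     "What is nucleus sampling?", "", "", do_sample=False,
#     temperature=0.4, max_new_tokens=64, top_p=0.9, repetition_penalty=1.1,
# ))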
# Each example row must supply a value for all eight input components below
examples = [
    [
        "The following dialogue is a conversation between Emmanuel Macron and Elon Musk:",  # user_message
        "[Emmanuel Macron]: Hello Mr. Musk. Thank you for receiving me today.",  # assistant_message
        "I am Starling-7B by Tonic-AI, and I am ready to do anything to help my user.",  # system_prompt
        True,  # do_sample
        0.9,  # temperature
        450,  # max_new_tokens
        0.90,  # top_p
        1.9,  # repetition_penalty
    ]
]
iface = gr.Interface(
    fn=starling_bot.predict,
    title=title,
    description=description,
    inputs=[
        gr.Textbox(label="🌟🤩User Message", type="text", lines=5),
        gr.Textbox(label="💫🌠Starling Assistant Message or Instructions", lines=2),
        gr.Textbox(label="💫🌠Starling System Prompt or Instruction", lines=2),
        gr.Checkbox(label="Advanced (enable sampling)", value=False),
        gr.Slider(label="Temperature", value=0.7, minimum=0.05, maximum=1.0, step=0.05),
        gr.Slider(label="Max new tokens", value=100, minimum=25, maximum=256, step=1),
        gr.Slider(label="Top-p (nucleus sampling)", value=0.90, minimum=0.01, maximum=0.99, step=0.05),
        gr.Slider(label="Repetition penalty", value=1.9, minimum=1.0, maximum=2.0, step=0.05)
    ],
    outputs="text",
    examples=examples,
    theme="ParityError/Anime"
)
iface.launch()
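# Note on the "Advanced" checkbox: it feeds predict()'s do_sample flag, so when
# unchecked, generation is greedy and the temperature/top-p sliders have no effect.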