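# app.py — Gradio chat Space that streams replies from HuggingFaceH4/zephyr-7b-beta
# using transformers' TextIteratorStreamer.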
import os
import threading

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer

import app_math  # keeping your existing import

# ---- Model setup ----
HF_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
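# The token is optional here: it is only required for gated or private models.
# On a Space, set HUGGINGFACEHUB_API_TOKEN as a secret in the Space settings.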
MODEL_ID = "HuggingFaceH4/zephyr-7b-beta"
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32
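# fp16 halves memory on GPU; most CPUs lack fast fp16 kernels, so fall back to fp32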

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    token=HF_TOKEN,  # uses your HF token if needed
)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=dtype,
    low_cpu_mem_usage=True,
    token=HF_TOKEN,  # uses your HF token if needed
)
model.to(device)

# Many causal LM tokenizers ship without a pad token; fall back to EOS so
# generate() has a valid pad_token_id.
if tokenizer.pad_token_id is None and tokenizer.eos_token_id is not None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

def respond(message, history: list[tuple[str, str]], system_message, max_tokens, temperature, top_p):
    # Build chat messages: system prompt + prior turns + the latest user message
    messages = [{"role": "system", "content": system_message}]
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    # Tokenize with Zephyr's chat template
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,  # also return the attention mask, not just input_ids
        return_tensors="pt",
    ).to(device)
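    # Passing the attention mask explicitly (via return_dict=True above) avoids
    # generate() inferring one from the pad token, which we aliased to EOS earlier.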

    # Stream tokens as they are produced, skipping the prompt and special tokens
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    gen_kwargs = {
        "inputs": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "max_new_tokens": int(max_tokens),
        "do_sample": True,
        "temperature": float(temperature),
        "top_p": float(top_p),
        "eos_token_id": tokenizer.eos_token_id,
        "pad_token_id": tokenizer.pad_token_id,
        "streamer": streamer,
    }

    # Run generation in a background thread so we can yield tokens as they arrive
    thread = threading.Thread(target=model.generate, kwargs=gen_kwargs)
    thread.start()

    partial = ""
    for new_text in streamer:
        partial += new_text
        yield partial
    thread.join()  # make sure generation has finished before the generator exits

# ---- Gradio UI ----
# For information on how to customize the ChatInterface, peruse the gradio docs:
# https://www.gradio.app/docs/chatinterface
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
)
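
# Note: Gradio 4+ enables queuing by default; on Gradio 3 you would need
# demo.queue() before launch() for streaming (generator) responses to work.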
if __name__ == "__main__":
    demo.launch()