import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
# Load model and tokenizer
model_id = "suayptalha/FastLlama-3.2-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto"
)
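# Note: device_map="auto" requires the `accelerate` package to be installed, and
# float16 weights assume a GPU is available; on a CPU-only host, torch.float32 is
# usually the safer choice.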
# Llama-family tokenizers ship without a padding token; fall back to EOS so
# generate() does not warn about a missing pad_token_id.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# System prompt
system_prompt = "You are a friendly assistant named FastLlama."
def format_prompt(message: str, history: list):
    """Build a Zephyr-style prompt string from the chat history."""
    prompt = f"<|system|>\n{system_prompt}</s>\n"
    for user_msg, bot_msg in history:
        prompt += f"<|user|>\n{user_msg}</s>\n<|assistant|>\n{bot_msg}</s>\n"
    prompt += f"<|user|>\n{message}</s>\n<|assistant|>\n"
    return prompt
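# The template above uses Zephyr-style <|system|>/<|user|>/<|assistant|> markers with
# </s> terminators. Since FastLlama-3.2-3B-Instruct is a Llama-3.2 derivative, its
# tokenizer usually ships a built-in chat template; an alternative sketch (assuming that
# template is present; format_prompt_from_template is a hypothetical helper) would be:
#
#     def format_prompt_from_template(message: str, history: list) -> str:
#         messages = [{"role": "system", "content": system_prompt}]
#         for user_msg, bot_msg in history:
#             messages.append({"role": "user", "content": user_msg})
#             messages.append({"role": "assistant", "content": bot_msg})
#         messages.append({"role": "user", "content": message})
#         return tokenizer.apply_chat_template(
#             messages, tokenize=False, add_generation_prompt=True
#         )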
def respond(message: str, history: list):
    # Format the prompt with chat history
    full_prompt = format_prompt(message, history)
    # Tokenize input with attention mask
    inputs = tokenizer(
        full_prompt,
        return_tensors="pt",
        padding=True,
        truncation=True
    ).to(model.device)
    # Generate response with attention mask
    output = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=256,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id
    )
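    # Note: temperature and top_p only influence generation because do_sample=True;
    # they are ignored under greedy decoding.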
    # Slice off the prompt tokens and decode only the newly generated text,
    # skipping special tokens.
    response = tokenizer.decode(
        output[0][inputs.input_ids.shape[-1]:],
        skip_special_tokens=True
    )
    return response
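# Note: the code above assumes history arrives as a list of (user, assistant) tuples,
# the classic gr.ChatInterface format; if the interface is created with type="messages",
# history is passed as role/content dicts instead and format_prompt() would need adjusting.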
# Create chat interface
chat = gr.ChatInterface(
    fn=respond,
    title="FastLlama-3.2-3B Chat",
    description="Chat with the FastLlama-3.2-3B-Instruct AI assistant",
    examples=[
        ["Explain quantum computing in simple terms"],
        ["Write a poem about artificial intelligence"],
        ["What's the meaning of life?"]
    ],
    cache_examples=False
)
if __name__ == "__main__":
    # Bind to all interfaces so the app is reachable from outside the container
    # (e.g., on Hugging Face Spaces).
    chat.launch(server_name="0.0.0.0")