import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
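# Assumed dependencies for this script: gradio, torch, transformers, and
# accelerate (required by device_map="auto" below). A typical install:
#   pip install gradio torch transformers accelerate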
# Load model and tokenizer
model_id = "suayptalha/FastLlama-3.2-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto"
)
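# Note: torch.float16 assumes a CUDA GPU is available. On a CPU-only machine,
# the default torch.float32 (or bfloat16 where supported) is the safer choice.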
# System prompt
system_prompt = "You are a friendly assistant named FastLlama."
# Build the full prompt: system prompt first, then each prior
# (user, assistant) pair from Gradio's tuple-style history, then the new
# user message followed by an open assistant turn for the model to complete.
def format_prompt(message: str, history: list):
    prompt = f"<|system|>\n{system_prompt}</s>\n"
    for user_msg, bot_msg in history:
        prompt += f"<|user|>\n{user_msg}</s>\n<|assistant|>\n{bot_msg}</s>\n"
    prompt += f"<|user|>\n{message}</s>\n<|assistant|>\n"
    return prompt
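# For illustration: with one prior exchange ("Hi" -> "Hello!") and the new
# message "How are you?", format_prompt returns:
#
#   <|system|>
#   You are a friendly assistant named FastLlama.</s>
#   <|user|>
#   Hi</s>
#   <|assistant|>
#   Hello!</s>
#   <|user|>
#   How are you?</s>
#   <|assistant|>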
def respond(message: str, history: list):
    # Format the prompt with chat history
    full_prompt = format_prompt(message, history)
    # Tokenize input
    inputs = tokenizer(full_prompt, return_tensors="pt").to(model.device)
    # Generate response (passing the attention mask explicitly avoids the
    # transformers warning when pad_token_id is set to eos_token_id)
    output = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=256,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )
    # Decode only the newly generated tokens, slicing off the prompt
    response = tokenizer.decode(
        output[0][inputs.input_ids.shape[-1]:],
        skip_special_tokens=True
    )
    return response
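# Optional: a minimal streaming variant — a sketch assuming transformers'
# TextIteratorStreamer and gr.ChatInterface's support for generator functions.
# It is not wired up below; swap fn=respond for fn=respond_stream to try it.
from threading import Thread
from transformers import TextIteratorStreamer

def respond_stream(message: str, history: list):
    full_prompt = format_prompt(message, history)
    inputs = tokenizer(full_prompt, return_tensors="pt").to(model.device)
    # The streamer yields decoded text chunks as generate() produces tokens
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=256,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )
    # generate() blocks, so run it in a background thread and stream from here
    Thread(target=model.generate, kwargs=generation_kwargs).start()
    partial = ""
    for chunk in streamer:
        partial += chunk
        yield partial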
# Create chat interface
chat = gr.ChatInterface(
    fn=respond,
    title="FastLlama-3.2-3B Chat",
    description="Chat with the FastLlama-3.2-3B-Instruct AI assistant",
    examples=[
        ["Explain quantum computing in simple terms"],
        ["Write a poem about artificial intelligence"],
        ["What's the meaning of life?"]
    ],
    cache_examples=False
)
# Launch the app
if __name__ == "__main__":
    # Bind to 0.0.0.0 so the server is reachable from outside the container
    # (the usual setup for a Hugging Face Space)
    chat.launch(server_name="0.0.0.0")