import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load model and tokenizer
model_id = "suayptalha/FastLlama-3.2-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto"
)

# Explicitly set padding token (the tokenizer has no dedicated pad token)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# System prompt
system_prompt = "You are a friendly assistant named FastLlama."

def format_prompt(message: str, history: list):
    # history arrives from gr.ChatInterface as (user, assistant) message pairs
    prompt = f"<|system|>\n{system_prompt}\n"
    for user_msg, bot_msg in history:
        prompt += f"<|user|>\n{user_msg}\n<|assistant|>\n{bot_msg}\n"
    prompt += f"<|user|>\n{message}\n<|assistant|>\n"
    return prompt

def respond(message: str, history: list):
    # Format the prompt with chat history
    full_prompt = format_prompt(message, history)

    # Tokenize input with attention mask
    inputs = tokenizer(
        full_prompt,
        return_tensors="pt",
        padding=True,
        truncation=True
    ).to(model.device)

    # Generate response with attention mask
    output = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=256,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id
    )

    # Decode only the newly generated tokens, skipping special tokens
    response = tokenizer.decode(
        output[0][inputs.input_ids.shape[-1]:],
        skip_special_tokens=True
    )
    return response

# Create chat interface
chat = gr.ChatInterface(
    fn=respond,
    title="FastLlama-3.2-3B Chat",
    description="Chat with FastLlama-3.2-3B-Instruct AI assistant",
    examples=[
        ["Explain quantum computing in simple terms"],
        ["Write a poem about artificial intelligence"],
        ["What's the meaning of life?"]
    ],
    cache_examples=False
)

if __name__ == "__main__":
    chat.launch(server_name="0.0.0.0")
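
# Note: format_prompt() above uses hand-rolled <|system|>/<|user|>/<|assistant|>
# tags. If the model's tokenizer ships a chat template (Llama-3.2-Instruct
# derivatives typically do), a hedged alternative sketch is to let the tokenizer
# build the prompt instead, e.g.:
#
#     messages = [{"role": "system", "content": system_prompt}]
#     for user_msg, bot_msg in history:
#         messages += [{"role": "user", "content": user_msg},
#                      {"role": "assistant", "content": bot_msg}]
#     messages.append({"role": "user", "content": message})
#     input_ids = tokenizer.apply_chat_template(
#         messages, add_generation_prompt=True, return_tensors="pt"
#     ).to(model.device)
#
# The rest of respond() would then pass input_ids to model.generate() as above.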