import gradio as gr
import torch

from transformers import AutoModelForCausalLM, AutoTokenizer

# Llama-2 checkpoints are gated on the Hugging Face Hub: request access and
# log in (e.g. `huggingface-cli login`) before downloading. For a chatbot,
# the instruction-tuned "meta-llama/Llama-2-7b-chat-hf" variant is usually a
# better fit than this base model.
model_name = "meta-llama/Llama-2-7b-hf"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# float16 halves memory use; on a CPU-only machine torch.float32 is safer.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
model = model.to("cuda" if torch.cuda.is_available() else "cpu")

def generate_response(user_input, chat_history):
    chat_history.append({"role": "user", "content": user_input})

    # Flatten the running history into a plain-text prompt. The base model
    # has no chat template, so a simple "role: content" transcript is used,
    # ending with a cue for the model to answer as the assistant.
    conversation = ""
    for turn in chat_history:
        conversation += f"{turn['role']}: {turn['content']}\n"
    conversation += "assistant: "

    inputs = tokenizer(conversation, return_tensors="pt").to(model.device)
    # max_new_tokens bounds only the reply; max_length would also count the
    # (growing) prompt and eventually leave no room to generate.
    outputs = model.generate(
        **inputs, max_new_tokens=256, do_sample=True, temperature=0.7
    )
    # Decode only the newly generated tokens; decoding outputs[0] in full
    # would echo the entire prompt back into the reply.
    response = tokenizer.decode(
        outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True
    )

    chat_history.append({"role": "assistant", "content": response})
    return response, chat_history

# Per-session chat history lives in gr.State inside the Blocks context; a
# module-level list would be shared by every user connected to the app.
with gr.Blocks() as chat_interface:
    gr.Markdown("## LLaMA-2 Chatbot")
    chat_input = gr.Textbox(label="Your Message")
    # type="messages" lets the Chatbot render the same role/content dicts
    # used to build the prompt (recent Gradio releases; older ones expect
    # a list of (user, assistant) tuples instead).
    chat_output = gr.Chatbot(type="messages")
    chat_state = gr.State([])

    def handle_input(user_input, chat_history):
        _, chat_history = generate_response(user_input, chat_history)
        # Clear the textbox and push the updated history to both the
        # Chatbot display and the session state.
        return "", chat_history, chat_history

    chat_input.submit(
        handle_input,
        inputs=[chat_input, chat_state],
        outputs=[chat_input, chat_output, chat_state],
    )

chat_interface.launch()