import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

model_name = "codellama/CodeLlama-7b-Instruct-hf"

print("Loading model...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)
print("Model loaded.")

# Text-generation pipeline with conservative sampling settings suited to code tasks.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    temperature=0.1,
    top_p=0.95,
    max_new_tokens=512,
    repetition_penalty=1.05,
)


def format_prompt(chat):
    """Convert (user, assistant) pairs into CodeLlama's [INST] ... [/INST] format."""
    prompt = ""
    for user_msg, ai_reply in chat:
        prompt += f"[INST] {user_msg.strip()} [/INST] {ai_reply.strip()}\n"
    return prompt


def chat_fn(user_input, history):
    history = history or []
    # Build the prompt from past turns plus the new, not-yet-answered question.
    prompt = format_prompt(history + [[user_input, ""]])
    # return_full_text=False makes the pipeline return only the completion,
    # so we don't have to slice the prompt back off the output.
    answer = generator(prompt, do_sample=True, return_full_text=False)[0]["generated_text"].strip()
    history.append((user_input, answer))
    return "", history


with gr.Blocks() as demo:
    gr.Markdown("# 🦙 CodeLlama Copilot\nFree & private code assistant.")
    # Pair-style (user, assistant) history, matching what chat_fn builds and returns.
    chatbot = gr.Chatbot(label="Developer Assistant", height=400)
    with gr.Row():
        msg = gr.Textbox(placeholder="Ask me coding questions", show_label=False, container=False)
        clear = gr.Button("🔄 Clear Conversation")

    msg.submit(chat_fn, [msg, chatbot], [msg, chatbot])
    clear.click(lambda: ("", []), None, [msg, chatbot])

if __name__ == "__main__":
    demo.launch()