import gradio as gr
import spaces
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer

# model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

pipeline = transformers.pipeline(
    "text-generation",
    model=model_name,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device="cpu",
)


def chat_function(message, history, system_prompt, max_new_tokens, temperature):
    messages = []
    # Rebuild the conversation from history if it is provided.
    if history:
        for user_msg, assistant_msg in history:
            messages.append({"role": "user", "content": user_msg})
            messages.append({"role": "assistant", "content": assistant_msg})
    # Always add the current user message.
    messages.append({"role": "user", "content": message})
    # Mistral-7B-Instruct's chat template has no system role, so prepend the
    # system prompt to the first user turn instead.
    if system_prompt:
        messages[0]["content"] = f"{system_prompt}\n\n{messages[0]['content']}"

    # Construct the prompt using the pipeline's tokenizer.
    prompt = pipeline.tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Stop generation at the end-of-sequence token.
    terminators = [pipeline.tokenizer.eos_token_id]

    # Nudge the temperature slightly above the slider value so that
    # do_sample=True never receives a temperature of exactly 0.
    adjusted_temp = temperature + 0.1

    # Generate the response.
    outputs = pipeline(
        prompt,
        max_new_tokens=max_new_tokens,
        eos_token_id=terminators,
        do_sample=True,
        temperature=adjusted_temp,
        top_p=0.9,
    )

    # Return only the newly generated text, not the echoed prompt.
    generated_text = outputs[0]["generated_text"]
    return generated_text[len(prompt):]


# Gradio interface setup
gr.Interface(
    fn=chat_function,
    inputs=[
        gr.Textbox(placeholder="Enter your message here", label="Your Message"),
        gr.JSON(label="Conversation History (format as [[user, assistant], ...])"),
        gr.Textbox(label="System Prompt"),
        gr.Slider(512, 4096, label="Max New Tokens"),
        gr.Slider(0.0, 1.0, step=0.1, label="Temperature"),
    ],
    outputs=gr.Textbox(label="AI Response"),
).launch()


# def chat_function(message, history, system_prompt, max_new_tokens, temperature):
#     messages = [
#         {"role": "system", "content": system_prompt},
#         {"role": "user", "content": message},
#     ]
#     prompt = pipeline.tokenizer.apply_chat_template(
#         messages,
#         tokenize=False,
#         add_generation_prompt=True
#     )
#     terminators = [
#         pipeline.tokenizer.eos_token_id,
#         pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
#     ]
#     temp = temperature + 0.1
#     outputs = pipeline(
#         prompt,
#         max_new_tokens=max_new_tokens,
#         eos_token_id=terminators,
#         do_sample=True,
#         temperature=temp,
#         top_p=0.9,
#     )
#     return outputs[0]["generated_text"][len(prompt):]

# gr.ChatInterface(
#     chat_function,
#     chatbot=gr.Chatbot(height=400),
#     textbox=gr.Textbox(placeholder="Enter message here", container=False, scale=7),
#     title="Meta-Llama-3-8B-Instruct",
#     description="""
#     To learn about fine-tuning Llama-3-8B, check https://exnrt.com/blog/ai/finetune-llama3-8b/.
#     """,
#     additional_inputs=[
#         gr.Textbox("You are helpful AI.", label="System Prompt"),
#         gr.Slider(512, 4096, label="Max New Tokens"),
#         gr.Slider(0, 1, label="Temperature")
#     ]
# ).launch()

# The Code
# import gradio as gr
# import os
# import spaces
# from transformers import GemmaTokenizer, AutoModelForCausalLM
# from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
# from threading import Thread

# # Set an environment variable
# HF_TOKEN = os.environ.get("HF_TOKEN", None)

# DESCRIPTION = '''
# This Space demonstrates the instruction-tuned model Meta Llama3 8b Chat. Meta Llama3 is the new open LLM and comes in two sizes: 8b and 70b. Feel free to play with it, or duplicate to run privately!
# For more details about the Llama3 release and how to use the model with transformers, take a look at our blog post.
# Looking for an even more powerful model? Check out the Hugging Chat integration for Meta Llama 3 70b.
# Ask me anything...
# '''
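
# A minimal, commented-out sketch of calling chat_function directly, without
# the Gradio UI. The message, history, system prompt, and generation settings
# below are illustrative placeholders, not values from the original app;
# running it still requires downloading and loading the Mistral weights.
# demo_history = [["Hello!", "Hi there, how can I help?"]]
# reply = chat_function(
#     message="Summarize what you can do in one sentence.",
#     history=demo_history,
#     system_prompt="You are a concise assistant.",
#     max_new_tokens=128,
#     temperature=0.7,
# )
# print(reply)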