import gradio as gr
import spaces  # required on Hugging Face Spaces; unused for plain CPU inference
import torch
import transformers

# model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

pipeline = transformers.pipeline(
    "text-generation",
    model=model_name,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device="cpu",
)


def chat_function(message, history, system_prompt, max_new_tokens, temperature):
    messages = []
    # Replay prior turns if a history was provided (None or empty means a fresh conversation)
    if history:
        for user_msg, assistant_msg in history:
            messages.append({"role": "user", "content": user_msg})
            messages.append({"role": "assistant", "content": assistant_msg})
    # The Mistral-7B-Instruct-v0.2 chat template does not define a system role,
    # so fold the system prompt (if any) into the current user message.
    if system_prompt:
        message = f"{system_prompt}\n\n{message}"
    # Always add the current user message
    messages.append({"role": "user", "content": message})
    # Construct the prompt using the pipeline's tokenizer
    prompt = pipeline.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    # Stop generation at the tokenizer's end-of-sequence token
    terminators = [pipeline.tokenizer.eos_token_id]
    # Keep the sampling temperature strictly positive, since do_sample=True fails at 0
    adjusted_temp = temperature + 0.1
    # Generate outputs with the adjusted parameters
    outputs = pipeline(
        prompt,
        max_new_tokens=max_new_tokens,
        eos_token_id=terminators,
        do_sample=True,
        temperature=adjusted_temp,
        top_p=0.9
    )
    # Extract the generated text, skipping the prompt portion
    generated_text = outputs[0]["generated_text"]
    return generated_text[len(prompt):]  # Return only the newly generated reply


# Gradio interface setup
gr.Interface(
    fn=chat_function,
    inputs=[
        gr.Textbox(placeholder="Enter your message here", label="Your Message"),
        # History is passed explicitly because gr.Interface (unlike gr.ChatInterface) does not manage it
        gr.JSON(label="Conversation History (format as [[user, assistant], ...])"),
        gr.Textbox(label="System Prompt"),
        gr.Slider(512, 4096, label="Max New Tokens"),
        gr.Slider(0.0, 1.0, step=0.1, label="Temperature")
    ],
    outputs=gr.Textbox(label="AI Response")
).launch()


# def chat_function(message, history, system_prompt, max_new_tokens, temperature):
#     messages = [
#         {"role": "system", "content": system_prompt},
#         {"role": "user", "content": message},
#     ]
#     prompt = pipeline.tokenizer.apply_chat_template(
#         messages,
#         tokenize=False,
#         add_generation_prompt=True
#     )
#     terminators = [
#         pipeline.tokenizer.eos_token_id,
#         pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
#     ]
#     temp = temperature + 0.1
#     outputs = pipeline(
#         prompt,
#         max_new_tokens=max_new_tokens,
#         eos_token_id=terminators,
#         do_sample=True,
#         temperature=temp,
#         top_p=0.9,
#     )
#     return outputs[0]["generated_text"][len(prompt):]

# gr.ChatInterface(
#     chat_function,
#     chatbot=gr.Chatbot(height=400),
#     textbox=gr.Textbox(placeholder="Enter message here", container=False, scale=7),
#     title="Meta-Llama-3-8B-Instruct",
#     description="""
#     To learn about fine-tuning Llama-3-8B, check https://exnrt.com/blog/ai/finetune-llama3-8b/.
#     """,
#     additional_inputs=[
#         gr.Textbox("You are helpful AI.", label="System Prompt"),
#         gr.Slider(512, 4096, label="Max New Tokens"),
#         gr.Slider(0, 1, label="Temperature")
#     ]
# ).launch()


# The Code
# import gradio as gr
# import os
# import spaces
# from transformers import GemmaTokenizer, AutoModelForCausalLM
# from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
# from threading import Thread

# # Set an environment variable
# HF_TOKEN = os.environ.get("HF_TOKEN", None)

# DESCRIPTION = '''
# Meta Llama3 8B
#
# This Space demonstrates the instruction-tuned model Meta Llama3 8b Chat. Meta Llama3 is the new open LLM and comes in two sizes: 8b and 70b. Feel free to play with it, or duplicate to run privately!
#
# šŸ”Ž For more details about the Llama3 release and how to use the model with transformers, take a look at our blog post.
#
# šŸ¦• Looking for an even more powerful model? Check out the Hugging Chat integration for Meta Llama 3 70b
# '''

# LICENSE = """
# ---
# Built with Meta Llama 3
# """

# PLACEHOLDER = """
# Meta llama3
# Ask me anything...
# """

# css = """
# h1 {
#   text-align: center;
#   display: block;
# }
# #duplicate-button {
#   margin: auto;
#   color: white;
#   background: #1565c0;
#   border-radius: 100vh;
# }
# """

# # Load the tokenizer and model
# tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
# model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto")  # .to("cuda:0")

# terminators = [
#     tokenizer.eos_token_id,
#     tokenizer.convert_tokens_to_ids("<|eot_id|>")
# ]

# @spaces.GPU(duration=120)
# def chat_llama3_8b(message: str,
#                    history: list,
#                    temperature: float,
#                    max_new_tokens: int
#                    ) -> str:
#     """
#     Generate a streaming response using the llama3-8b model.
#     Args:
#         message (str): The input message.
#         history (list): The conversation history used by ChatInterface.
#         temperature (float): The temperature for generating the response.
#         max_new_tokens (int): The maximum number of new tokens to generate.
#     Returns:
#         str: The generated response.
#     """
#     conversation = []
#     for user, assistant in history:
#         conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
#     conversation.append({"role": "user", "content": message})
#
#     input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt").to(model.device)
#
#     streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
#
#     generate_kwargs = dict(
#         input_ids=input_ids,
#         streamer=streamer,
#         max_new_tokens=max_new_tokens,
#         do_sample=True,
#         temperature=temperature,
#         eos_token_id=terminators,
#     )
#     # This enforces greedy generation (do_sample=False) when the temperature is 0, avoiding the crash.
#     if temperature == 0:
#         generate_kwargs['do_sample'] = False
#
#     t = Thread(target=model.generate, kwargs=generate_kwargs)
#     t.start()
#
#     outputs = []
#     for text in streamer:
#         outputs.append(text)
#         print(outputs)
#         yield "".join(outputs)

# # Gradio block
# chatbot = gr.Chatbot(height=450, placeholder=PLACEHOLDER, label='Gradio ChatInterface')

# with gr.Blocks(fill_height=True, css=css) as demo:
#     gr.Markdown(DESCRIPTION)
#     gr.DuplicateButton(value="Duplicate Space for private use", elem_id="duplicate-button")
#     gr.ChatInterface(
#         fn=chat_llama3_8b,
#         chatbot=chatbot,
#         fill_height=True,
#         additional_inputs_accordion=gr.Accordion(label="āš™ļø Parameters", open=False, render=False),
#         additional_inputs=[
#             gr.Slider(minimum=0,
#                       maximum=1,
#                       step=0.1,
#                       value=0.95,
#                       label="Temperature",
#                       render=False),
#             gr.Slider(minimum=128,
#                       maximum=4096,
#                       step=1,
#                       value=512,
#                       label="Max new tokens",
#                       render=False),
#         ],
#         examples=[
#             ['How to setup a human base on Mars? Give short answer.'],
#             ['Explain theory of relativity to me like I’m 8 years old.'],
#             ['What is 9,000 * 9,000?'],
#             ['Write a pun-filled happy birthday message to my friend Alex.']
#         ],
#         cache_examples=False,
#     )
#     gr.Markdown(LICENSE)

# if __name__ == "__main__":
#     demo.launch()
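
# --- Illustrative usage sketch (not part of the original Space) ---
# The active chat_function defined at the top of this file can also be exercised
# directly from Python, which is handy as a quick smoke test before using the UI.
# The history value mirrors the gr.JSON input format, i.e. a list of
# [user, assistant] pairs; the sample strings below are placeholders, not values
# taken from the original app.
#
# example_history = [["Hi", "Hello! How can I help you today?"]]
# reply = chat_function(
#     message="Summarize what a chat template does in one sentence.",
#     history=example_history,
#     system_prompt="You are a helpful AI.",
#     max_new_tokens=128,
#     temperature=0.7,
# )
# print(reply)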
# """ # css = """ # h1 { # text-align: center; # display: block; # } # #duplicate-button { # margin: auto; # color: white; # background: #1565c0; # border-radius: 100vh; # } # """ # # Load the tokenizer and model # tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct") # model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto") # to("cuda:0") # terminators = [ # tokenizer.eos_token_id, # tokenizer.convert_tokens_to_ids("<|eot_id|>") # ] # @spaces.GPU(duration=120) # def chat_llama3_8b(message: str, # history: list, # temperature: float, # max_new_tokens: int # ) -> str: # """ # Generate a streaming response using the llama3-8b model. # Args: # message (str): The input message. # history (list): The conversation history used by ChatInterface. # temperature (float): The temperature for generating the response. # max_new_tokens (int): The maximum number of new tokens to generate. # Returns: # str: The generated response. # """ # conversation = [] # for user, assistant in history: # conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}]) # conversation.append({"role": "user", "content": message}) # input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt").to(model.device) # streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True) # generate_kwargs = dict( # input_ids= input_ids, # streamer=streamer, # max_new_tokens=max_new_tokens, # do_sample=True, # temperature=temperature, # eos_token_id=terminators, # ) # # This will enforce greedy generation (do_sample=False) when the temperature is passed 0, avoiding the crash. # if temperature == 0: # generate_kwargs['do_sample'] = False # t = Thread(target=model.generate, kwargs=generate_kwargs) # t.start() # outputs = [] # for text in streamer: # outputs.append(text) # print(outputs) # yield "".join(outputs) # # Gradio block # chatbot=gr.Chatbot(height=450, placeholder=PLACEHOLDER, label='Gradio ChatInterface') # with gr.Blocks(fill_height=True, css=css) as demo: # gr.Markdown(DESCRIPTION) # gr.DuplicateButton(value="Duplicate Space for private use", elem_id="duplicate-button") # gr.ChatInterface( # fn=chat_llama3_8b, # chatbot=chatbot, # fill_height=True, # additional_inputs_accordion=gr.Accordion(label="āš™ļø Parameters", open=False, render=False), # additional_inputs=[ # gr.Slider(minimum=0, # maximum=1, # step=0.1, # value=0.95, # label="Temperature", # render=False), # gr.Slider(minimum=128, # maximum=4096, # step=1, # value=512, # label="Max new tokens", # render=False ), # ], # examples=[ # ['How to setup a human base on Mars? Give short answer.'], # ['Explain theory of relativity to me like I’m 8 years old.'], # ['What is 9,000 * 9,000?'], # ['Write a pun-filled happy birthday message to my friend Alex.'] # ], # cache_examples=False, # ) # gr.Markdown(LICENSE) # if __name__ == "__main__": # demo.launch()