Llama-3.1

Sleeping

App Files Files Community

Llama-3.1 / app.py

Nymbo

Update app.py

c20c4dd verified 8 months ago

raw

history blame

6.01 kB

	import gradio as gr
	from openai import OpenAI
	import os

	# Retrieve the access token from the environment variable
	ACCESS_TOKEN = os.getenv("HF_TOKEN")
	if ACCESS_TOKEN:
	    print("Access token loaded.")
	else:
	    # os.getenv returns None when the variable is unset; report that instead
	    # of claiming success, so a missing secret is easy to spot in the logs.
	    print("Warning: HF_TOKEN is not set; no access token was loaded.")

	# Initialize the OpenAI client with the Hugging Face Inference API endpoint
	client = OpenAI(
	    base_url="https://api-inference.huggingface.co/v1/",
	    api_key=ACCESS_TOKEN,
	)
	print("OpenAI client initialized.")

	def respond(
	    message,
	    history: list[tuple[str, str]],
	    system_message,
	    max_tokens,
	    temperature,
	    top_p,
	    frequency_penalty,
	    seed,
	    model,
	    custom_model
	):
	    """
	    Stream a chat completion for ``message`` and yield the growing reply.

	    Args:
	        message: The latest user message.
	        history: Earlier turns as (user, assistant) pairs; None is treated as
	            an empty history.
	        system_message: Content of the leading "system" message.
	        max_tokens: Forwarded to the API as ``max_tokens``.
	        temperature: Forwarded to the API as ``temperature``.
	        top_p: Forwarded to the API as ``top_p``.
	        frequency_penalty: Forwarded to the API as ``frequency_penalty``.
	        seed: Forwarded to the API as ``seed``; -1 is sent as None.
	        model: Model name used when no custom model is given.
	        custom_model: Optional model name that overrides ``model`` when it is
	            non-blank; None or whitespace-only means "not provided".

	    Yields:
	        The response text accumulated so far, once per received token.
	    """
	    print(f"Received message: {message}")
	    print(f"History: {history}")
	    print(f"System message: {system_message}")
	    print(f"Model: {model}, Custom Model: {custom_model}")

	    # Use custom model if provided, else use selected model.
	    # `or ""` keeps a None custom model from crashing on .strip().
	    custom_name = (custom_model or "").strip()
	    selected_model = custom_name if custom_name else model
	    print(f"Selected model: {selected_model}")

	    # Construct the messages array required by the API
	    messages = [{"role": "system", "content": system_message}]

	    # Add conversation history to the context
	    for val in history or []:
	        user_part = val[0]
	        assistant_part = val[1]
	        if user_part:
	            messages.append({"role": "user", "content": user_part})
	            print(f"Added user message to context: {user_part}")
	        if assistant_part:
	            messages.append({"role": "assistant", "content": assistant_part})
	            print(f"Added assistant message to context: {assistant_part}")

	    # Append the latest user message
	    messages.append({"role": "user", "content": message})

	    # Start with an empty string to build the response as tokens stream in
	    response = ""
	    print("Sending request to OpenAI API.")

	    # Make the streaming request to the HF Inference API via OpenAI-like client
	    stream = client.chat.completions.create(
	        model=selected_model,
	        max_tokens=max_tokens,
	        stream=True,
	        temperature=temperature,
	        top_p=top_p,
	        frequency_penalty=frequency_penalty,
	        seed=seed if seed != -1 else None,
	        messages=messages,
	    )
	    for message_chunk in stream:
	        # A chunk may carry no choices at all; there is nothing to append then.
	        if not message_chunk.choices:
	            continue

	        # Extract the token text from the response chunk
	        token_text = message_chunk.choices[0].delta.content
	        # The delta content can be None (e.g. role-only or final chunks);
	        # concatenating None to a str would raise TypeError.
	        if token_text is None:
	            continue

	        print(f"Received token: {token_text}")
	        response += token_text
	        yield response

	    print("Completed response generation.")

	# Create a Chatbot component
	# NOTE: the name `chatbot` is rebound to a new component inside the
	# gr.Blocks layout further down in this file.
	chatbot = gr.Chatbot(height=600)
	print("Chatbot interface created.")

	# Define the featured models for the dropdown.
	# Entries are full Hugging Face Hub repository ids ("owner/name"), because
	# the selected value is passed verbatim as the `model` of the API request.
	models_list = [
	    "meta-llama/Llama-3.3-70B-Instruct",
	    "bigscience/bloom",
	    "EleutherAI/gpt-j-6b",
	    "facebook/opt-30b",
	    "google/flan-t5-xxl",
	]

	# Function to filter models based on user input
	def filter_models(search_term):
	    """Return the entries of models_list that contain search_term, ignoring case.

	    A missing (None) search term is treated like an empty string, so every
	    model is returned instead of raising AttributeError.
	    """
	    needle = (search_term or "").lower()
	    return [m for m in models_list if needle in m.lower()]

	# Gradio interface
	def _stream_to_chatbot(
	    message,
	    history,
	    system_message,
	    max_tokens,
	    temperature,
	    top_p,
	    frequency_penalty,
	    seed,
	    model,
	    custom_model,
	):
	    """Adapt respond()'s text stream to the turn list shown by the Chatbot.

	    respond() yields the reply text accumulated so far; the Chatbot output
	    needs the whole conversation, so each partial reply is appended to the
	    previous turns as a [user, assistant] pair.
	    NOTE(review): assumes the Chatbot uses the pair ("tuples") history format,
	    which is also what respond() indexes into - confirm for the installed
	    Gradio version.
	    """
	    prior_turns = list(history or [])
	    for partial_reply in respond(
	        message,
	        prior_turns,
	        system_message,
	        max_tokens,
	        temperature,
	        top_p,
	        frequency_penalty,
	        seed,
	        model,
	        custom_model,
	    ):
	        yield prior_turns + [[message, partial_reply]]

	with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
	    with gr.Row():
	        chatbot = gr.Chatbot(height=600)

	    with gr.Tab("Chat Interface"):
	        with gr.Row():
	            user_input = gr.Textbox(label="Your Message", placeholder="Type your message here...")
	        with gr.Row():
	            system_message = gr.Textbox(value="", label="System Message")
	        with gr.Row():
	            max_tokens = gr.Slider(minimum=1, maximum=4096, value=512, step=1, label="Max Tokens")
	            temperature = gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature")
	        with gr.Row():
	            top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.9, step=0.05, label="Top-P")
	            frequency_penalty = gr.Slider(minimum=-2.0, maximum=2.0, value=0.0, step=0.1, label="Frequency Penalty")
	            seed = gr.Slider(minimum=-1, maximum=65535, value=-1, step=1, label="Seed (-1 for random)")
	        with gr.Row():
	            model = gr.Dropdown(label="Select a Model", choices=models_list, value="meta-llama/Llama-3.3-70B-Instruct")
	            custom_model = gr.Textbox(label="Custom Model", placeholder="Enter custom model path")
	        with gr.Row():
	            run_button = gr.Button("Generate Response")

	    with gr.Tab("Information"):
	        with gr.Accordion("Featured Models", open=False):
	            gr.HTML(
	                """
	<table>
	<tr><th>Model Name</th><th>Description</th></tr>
	<tr><td>meta-llama/Llama-3.3-70B-Instruct</td><td>Instruction-tuned LLaMA model</td></tr>
	<tr><td>bigscience/bloom</td><td>Multilingual large language model</td></tr>
	<tr><td>EleutherAI/gpt-j-6b</td><td>Open-source GPT model</td></tr>
	<tr><td>facebook/opt-30b</td><td>Meta's OPT model</td></tr>
	<tr><td>google/flan-t5-xxl</td><td>Google's Flan-tuned T5 XXL</td></tr>
	</table>
	"""
	            )
	        with gr.Accordion("Parameters Overview", open=False):
	            gr.Markdown(
	                """
	### Parameters Overview
	- Max Tokens: Maximum number of tokens in the response.
	- Temperature: Controls the randomness of responses. Lower values make the output more deterministic.
	- Top-P: Controls the diversity of responses by limiting the token selection to a probability mass.
	- Frequency Penalty: Penalizes repeated tokens in the output.
	- Seed: Fixes randomness for reproducibility. Use -1 for a random seed.
	"""
	            )

	    # The Chatbot component itself is both the history input and the output;
	    # gr.Chatbot has no `.state` attribute to pass as an input.
	    run_button.click(
	        _stream_to_chatbot,
	        inputs=[
	            user_input,
	            chatbot,
	            system_message,
	            max_tokens,
	            temperature,
	            top_p,
	            frequency_penalty,
	            seed,
	            model,
	            custom_model
	        ],
	        outputs=chatbot
	    )

	print("Launching the demo application.")
	demo.launch()