Spaces:

johnpaulbin
/

googoo

Sleeping

App Files Files Community

googoo / app.py

johnpaulbin

Create app.py

7f36089 verified 3 months ago

raw

history blame

1.72 kB

	from huggingface_hub import hf_hub_download
	from llama_cpp import Llama
	import gradio as gr

	# Fetch the GGUF weights for the model from its Hugging Face repository.
	model_name = "johnpaulbin/articulate-V1-Q8_0-GGUF"
	model_file = "articulate-V1-Q8_0.gguf"  # GGUF file inside that repository
	model_path = hf_hub_download(filename=model_file, repo_id=model_name)

	# Load the downloaded weights with llama-cpp-python.
	llm = Llama(
	    n_gpu_layers=0,  # CPU only (no GPU in free Spaces tier)
	    n_threads=2,  # number of CPU threads used for inference
	    n_ctx=1024,  # context length; adjust as needed
	    model_path=model_path,
	)

	def _history_to_messages(history):
	    """Convert Gradio chat history into a list of role/content dicts.

	    Two history shapes are accepted, because which one Gradio supplies
	    depends on how the ChatInterface is configured and on the Gradio version:

	    * pair style: each entry is a ``(user_msg, assistant_msg)`` sequence;
	    * message style: each entry is a dict with ``"role"`` and ``"content"``.

	    Entries whose content is ``None`` are skipped so that no message with a
	    missing body is handed to the model.

	    Args:
	        history: The history object passed by Gradio; ``None`` is treated as
	            an empty history.

	    Returns:
	        A new list of ``{"role": ..., "content": ...}`` dicts in
	        conversation order.
	    """
	    messages = []
	    for turn in history or []:
	        if isinstance(turn, dict):
	            # Message-style entry: keep only the two keys the model needs.
	            role = turn.get("role")
	            content = turn.get("content")
	            if role is not None and content is not None:
	                messages.append({"role": role, "content": content})
	            continue

	        # Pair-style entry: one user message followed by the reply to it.
	        user_msg, assistant_msg = turn
	        if user_msg is not None:
	            messages.append({"role": "user", "content": user_msg})
	        if assistant_msg is not None:
	            messages.append({"role": "assistant", "content": assistant_msg})
	    return messages


	# Define the chat function for Gradio
	def chat(message, history):
	    """Generate the assistant's reply to ``message`` given the prior chat.

	    Args:
	        message: The new user input.
	        history: Previous turns as supplied by Gradio, either as
	            ``(user_msg, assistant_msg)`` pairs or as role/content dicts.

	    Returns:
	        The ``content`` field of the first choice returned by
	        ``llm.create_chat_completion``.
	    """
	    # Prior turns first, then the current user input as the final message.
	    messages = _history_to_messages(history)
	    messages.append({"role": "user", "content": message})

	    # Perform inference with greedy decoding
	    response = llm.create_chat_completion(
	        messages=messages,
	        max_tokens=100,  # Limit output length
	        top_k=1,  # Greedy decoding: select the top token
	        temperature=0.01,  # Low temperature for determinism (top_k=1 is sufficient)
	    )

	    # Extract and return the generated text
	    return response["choices"][0]["message"]["content"]

	# Expose the chat function through a Gradio chat UI.
	iface = gr.ChatInterface(
	    fn=chat,
	    description="Chat with the Articulate V1 model (Llama 3-based) using greedy decoding.",
	    title="Articulate V1 Chatbot",
	)

	# Start serving the interface.
	iface.launch()