import sys

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
# --- Model Configuration ---
# The Hugging Face model repository ID
MODEL_REPO_ID = "mradermacher/Sam-reason-v3-GGUF"
# The specific GGUF filename within that repository
MODEL_FILENAME = "Sam-reason-v3.Q4_K_M.gguf"
# Maximum context window for the model (how much text it can 'remember').
# Adjust this based on your needs and available memory.
N_CTX = 2048
# Maximum number of tokens the model will generate in a single response
MAX_TOKENS = 500
# Temperature for generation: higher values (e.g., 0.8-1.0) make output more random,
# lower values (e.g., 0.2-0.5) make it more focused.
TEMPERATURE = 0.7
# Top-p sampling: controls diversity. Lower values focus on more probable tokens.
TOP_P = 0.9
# Stop sequences: the model will stop generating when it encounters any of these strings.
# This prevents it from generating further turns or excessive boilerplate.
STOP_SEQUENCES = ["USER:", "\n\n"]
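# Illustrative example of how the stop sequences behave: a raw completion such as
# "Paris is the capital of France.\n\nUSER: next question..." would be truncated to
# "Paris is the capital of France." because generation halts before the blank line
# or the next "USER:" turn.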
# --- Model Loading ---
print(f"Downloading model: {MODEL_FILENAME} from {MODEL_REPO_ID}...")
try:
    # Download the GGUF model file from the Hugging Face Hub
    model_path = hf_hub_download(repo_id=MODEL_REPO_ID, filename=MODEL_FILENAME)
    print(f"Model downloaded to: {model_path}")
except Exception as e:
    print(f"Error downloading model: {e}")
    # Exit if the model can't be downloaded; nothing else can run without it.
    sys.exit(1)
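# Note: hf_hub_download caches files locally (by default under ~/.cache/huggingface/hub,
# or under HF_HOME if that environment variable is set), so restarts reuse the
# already-downloaded GGUF file instead of fetching it again.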
print("Initializing Llama model (this may take a moment)...") | |
try: | |
# Initialize the Llama model | |
# n_gpu_layers=0 ensures the model runs entirely on the CPU, | |
# which is necessary for the free tier on Hugging Face Spaces. | |
llm = Llama( | |
model_path=model_path, | |
n_gpu_layers=0, # Force CPU usage | |
n_ctx=N_CTX, # Set context window size | |
verbose=False # Suppress llama_cpp verbose output | |
) | |
print("Llama model initialized successfully.") | |
except Exception as e: | |
print(f"Error initializing Llama model: {e}") | |
exit(1) | |
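# Optional (not set here): llama_cpp's Llama also accepts n_threads=<int> to control
# the number of CPU threads used for inference; the library default is generally
# acceptable on the Spaces free CPU tier.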
# --- Inference Function ---
def generate_word_by_word(prompt_text: str):
    """
    Generates text from the LLM word by word (token by token) and yields the output.
    This provides a streaming experience in the Gradio UI and for API calls.
    """
    # Define the prompt template. This model does not specify a strict chat format,
    # so a simple instruction-following format is used.
    formatted_prompt = f"USER: {prompt_text}\nASSISTANT:"
    print(f"Starting generation for prompt: '{prompt_text[:50]}...'")
    output_tokens = []
    try:
        # Use the create_completion method with stream=True for token-by-token generation
        for chunk in llm.create_completion(
            formatted_prompt,
            max_tokens=MAX_TOKENS,
            stop=STOP_SEQUENCES,
            stream=True,
            temperature=TEMPERATURE,
            top_p=TOP_P,
        ):
            token = chunk["choices"][0]["text"]
            output_tokens.append(token)
            # Yield the accumulated text to update the UI/API response in real time
            yield "".join(output_tokens)
    except Exception as e:
        print(f"Error during text generation: {e}")
        yield f"An error occurred during generation: {e}"
# --- Gradio Interface ---
# Create the Gradio Interface for the web UI and API endpoint.
# Because fn is a generator, Gradio streams partial results to the UI automatically;
# live=True is not needed for streaming and would re-run generation on every keystroke.
iface = gr.Interface(
    fn=generate_word_by_word,
    inputs=gr.Textbox(
        lines=5,
        label="Enter your prompt here:",
        placeholder="e.g., Explain the concept of quantum entanglement in simple terms.",
    ),
    outputs=gr.Textbox(label="Generated Text", show_copy_button=True),
    title="SmilyAI: Sam-reason-v3-GGUF Word-by-Word Inference (CPU)",
    description=(
        "Enter a prompt and get a word-by-word response from the "
        "Sam-reason-v3-GGUF model, running on Hugging Face Spaces' free CPU tier. "
        "The response will stream as it is generated."
    ),
    api_name="predict",       # Expose this function as an API endpoint
    theme=gr.themes.Soft(),   # A modern, soft theme for better aesthetics
)
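# Example client call (sketch; assumes the Space is public, the gradio_client package
# is installed, and the placeholder URL below is replaced with the real Space URL):
#
#   from gradio_client import Client
#   client = Client("https://<your-username>-<your-space>.hf.space")
#   result = client.predict("Hello, Sam!", api_name="/predict")
#   print(result)
#
# client.predict() returns only the final streamed value; to observe intermediate
# partial outputs, use client.submit() and poll the returned job (e.g., job.outputs()).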
# Launch the Gradio application
if __name__ == "__main__":
    print("Launching Gradio app...")
    iface.launch(server_name="0.0.0.0", server_port=7860)  # Standard host and port for HF Spaces