import sys

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
# --- Model Configuration ---
# The Hugging Face model repository ID
MODEL_REPO_ID = "mradermacher/Sam-reason-v3-GGUF"
# The specific GGUF filename within that repository
MODEL_FILENAME = "Sam-reason-v3.Q4_K_M.gguf"
# Maximum context window for the model (how much text it can 'remember').
# Adjust this based on your needs and available memory.
N_CTX = 2048
# Maximum number of tokens the model will generate in a single response
MAX_TOKENS = 500
# Temperature for generation: higher values (e.g., 0.8-1.0) make output more random,
# lower values (e.g., 0.2-0.5) make it more focused.
TEMPERATURE = 0.7
# Top-p sampling: controls diversity. Lower values focus on more probable tokens.
TOP_P = 0.9
# Stop sequences: the model will stop generating when it encounters any of these strings.
# This prevents it from generating further turns or excessive boilerplate.
STOP_SEQUENCES = ["USER:", "\n\n"]
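# Illustrative example of how the stop sequences behave: a raw completion such as
# "Paris is the capital of France.\n\nUSER: next question..." would be truncated to
# "Paris is the capital of France." because generation halts before the blank line
# or the next "USER:" turn.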
# --- Model Loading ---
print(f"Downloading model: {MODEL_FILENAME} from {MODEL_REPO_ID}...")
try:
    # Download the GGUF model file from the Hugging Face Hub
    model_path = hf_hub_download(repo_id=MODEL_REPO_ID, filename=MODEL_FILENAME)
    print(f"Model downloaded to: {model_path}")
except Exception as e:
    print(f"Error downloading model: {e}")
    # Exit if the model can't be downloaded; nothing else can run without it.
    sys.exit(1)
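# Note: hf_hub_download caches files locally (by default under ~/.cache/huggingface/hub,
# or under HF_HOME if that environment variable is set), so restarts reuse the
# already-downloaded GGUF file instead of fetching it again.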
print("Initializing Llama model (this may take a moment)...") | |
try: | |
# Initialize the Llama model | |
# n_gpu_layers=0 ensures the model runs entirely on the CPU, | |
# which is necessary for the free tier on Hugging Face Spaces. | |
llm = Llama( | |
model_path=model_path, | |
n_gpu_layers=0, # Force CPU usage | |
n_ctx=N_CTX, # Set context window size | |
verbose=False # Suppress llama_cpp verbose output | |
) | |
print("Llama model initialized successfully.") | |
except Exception as e: | |
print(f"Error initializing Llama model: {e}") | |
exit(1) | |
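# Optional (not set here): llama_cpp's Llama also accepts n_threads=<int> to control
# the number of CPU threads used for inference; the library default is generally
# acceptable on the Spaces free CPU tier.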
# --- Inference Function ---
def generate_word_by_word(prompt_text: str):
    """
    Generates text from the LLM word by word (token by token) and yields the output.
    This provides a streaming experience in the Gradio UI and for API calls.
    """
    # Define the prompt template. This model does not specify a strict chat format,
    # so a simple instruction-following format is used.
    formatted_prompt = f"USER: {prompt_text}\nASSISTANT:"
    print(f"Starting generation for prompt: '{prompt_text[:50]}...'")
    output_tokens = []
    try:
        # Use the create_completion method with stream=True for token-by-token generation
        for chunk in llm.create_completion(
            formatted_prompt,
            max_tokens=MAX_TOKENS,
            stop=STOP_SEQUENCES,
            stream=True,
            temperature=TEMPERATURE,
            top_p=TOP_P,
        ):
            token = chunk["choices"][0]["text"]
            output_tokens.append(token)
            # Yield the accumulated text to update the UI/API response in real time
            yield "".join(output_tokens)
    except Exception as e:
        print(f"Error during text generation: {e}")
        yield f"An error occurred during generation: {e}"
# --- Gradio Interface ---
# Create the Gradio Interface for the web UI and API endpoint.
# Because fn is a generator, Gradio streams partial results to the UI automatically;
# live=True is not needed for streaming and would re-run generation on every keystroke.
iface = gr.Interface(
    fn=generate_word_by_word,
    inputs=gr.Textbox(
        lines=5,
        label="Enter your prompt here:",
        placeholder="e.g., Explain the concept of quantum entanglement in simple terms.",
    ),
    outputs=gr.Textbox(label="Generated Text", show_copy_button=True),
    title="SmilyAI: Sam-reason-v3-GGUF Word-by-Word Inference (CPU)",
    description=(
        "Enter a prompt and get a word-by-word response from the "
        "Sam-reason-v3-GGUF model, running on Hugging Face Spaces' free CPU tier. "
        "The response will stream as it is generated."
    ),
    api_name="predict",       # Expose this function as an API endpoint
    theme=gr.themes.Soft(),   # A modern, soft theme for better aesthetics
)
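# Example client call (sketch; assumes the Space is public, the gradio_client package
# is installed, and the placeholder URL below is replaced with the real Space URL):
#
#   from gradio_client import Client
#   client = Client("https://<your-username>-<your-space>.hf.space")
#   result = client.predict("Hello, Sam!", api_name="/predict")
#   print(result)
#
# client.predict() returns only the final streamed value; to observe intermediate
# partial outputs, use client.submit() and poll the returned job (e.g., job.outputs()).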
# Launch the Gradio application
if __name__ == "__main__":
    print("Launching Gradio app...")
    iface.launch(server_name="0.0.0.0", server_port=7860)  # Standard host and port for HF Spaces