import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# --- Model Configuration ---
# The Hugging Face model repository ID
MODEL_REPO_ID = "mradermacher/Sam-reason-v3-GGUF"
# The specific GGUF filename within that repository
MODEL_FILENAME = "Sam-reason-v3.Q4_K_M.gguf"
# Maximum context window for the model (how much text it can 'remember')
# Adjust this based on your needs and available memory.
N_CTX = 2048
# Maximum number of tokens the model will generate in a single response
MAX_TOKENS = 500
# Temperature for generation: higher values (e.g., 0.8-1.0) make output more random,
# lower values (e.g., 0.2-0.5) make it more focused.
TEMPERATURE = 0.7
# Top-p sampling: controls diversity. Lower values focus on more probable tokens.
TOP_P = 0.9
# Stop sequences: generation halts when the model emits any of these strings.
# "USER:" prevents the model from writing a follow-up user turn. Note that
# "\n\n" also ends generation at the first blank line, so multi-paragraph
# answers will be truncated; remove it if longer responses are desired.
STOP_SEQUENCES = ["USER:", "\n\n"]

# --- Model Loading ---
print(f"Downloading model: {MODEL_FILENAME} from {MODEL_REPO_ID}...")
try:
    # Download the GGUF model file from Hugging Face Hub
    model_path = hf_hub_download(repo_id=MODEL_REPO_ID, filename=MODEL_FILENAME)
    print(f"Model downloaded to: {model_path}")
except Exception as e:
    print(f"Error downloading model: {e}")
    # Exit or handle the error appropriately if the model can't be downloaded
    exit(1)

print("Initializing Llama model (this may take a moment)...")
try:
    # Initialize the Llama model
    # n_gpu_layers=0 ensures the model runs entirely on the CPU,
    # which is necessary for the free tier on Hugging Face Spaces.
    llm = Llama(
        model_path=model_path,
        n_gpu_layers=0,  # Force CPU usage
        n_ctx=N_CTX,     # Set context window size
        verbose=False    # Suppress llama_cpp verbose output
    )
    print("Llama model initialized successfully.")
except Exception as e:
    print(f"Error initializing Llama model: {e}")
    exit(1)
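
# Optional sanity check (illustrative only; not part of the app's request flow).
# A one-shot, non-streaming completion can confirm the model loads and responds
# before the Gradio UI is wired up:
#
#   smoke_test = llm.create_completion(
#       "USER: Say hello in five words.\nASSISTANT:",
#       max_tokens=32,
#       temperature=TEMPERATURE,
#   )
#   print(smoke_test["choices"][0]["text"])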

# --- Inference Function ---
def generate_word_by_word(prompt_text: str):
    """
    Generates text from the LLM word by word (or token by token) and yields the output.
    This provides a streaming experience in the Gradio UI and for API calls.
    """
    # Define the prompt template. This model does not specify a strict chat format,
    # so a simple instruction-following format is used.
    formatted_prompt = f"USER: {prompt_text}\nASSISTANT:"

    print(f"Starting generation for prompt: '{prompt_text[:50]}...'")
    output_tokens = []
    try:
        # Use the create_completion method with stream=True for token-by-token generation
        for chunk in llm.create_completion(
            formatted_prompt,
            max_tokens=MAX_TOKENS,
            stop=STOP_SEQUENCES,
            stream=True,
            temperature=TEMPERATURE,
            top_p=TOP_P,
        ):
            token = chunk["choices"][0]["text"]
            output_tokens.append(token)
            # Yield the accumulated text to update the UI/API response in real-time
            yield "".join(output_tokens)
    except Exception as e:
        print(f"Error during text generation: {e}")
        yield f"An error occurred during generation: {e}"

# --- Gradio Interface ---
# Create the Gradio Interface for the web UI and API endpoint
iface = gr.Interface(
    fn=generate_word_by_word,
    inputs=gr.Textbox(
        lines=5,
        label="Enter your prompt here:",
        placeholder="e.g., Explain the concept of quantum entanglement in simple terms."
    ),
    outputs=gr.Textbox(label="Generated Text", show_copy_button=True),
    title="SmilyAI: Sam-reason-v3-GGUF Word-by-Word Inference (CPU)",
    description=(
        "Enter a prompt and get a word-by-word response from the "
        "Sam-reason-v3-GGUF model, running on Hugging Face Spaces' free CPU tier. "
        "The response will stream as it's generated."
    ),
    # Note: streaming in the UI comes from the generator function yielding
    # partial text; live=True is deliberately omitted so inference runs only
    # when the user submits, not on every keystroke.
    api_name="predict",  # Expose this function as a REST API endpoint
    theme=gr.themes.Soft(), # A modern, soft theme for better aesthetics
)

# Launch the Gradio application
if __name__ == "__main__":
    print("Launching Gradio app...")
    iface.launch(server_name="0.0.0.0", server_port=7860)  # Default host and port for HF Spaces
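
# Example client call (illustrative; assumes the `gradio_client` package and the
# public URL of the deployed Space, shown here as a placeholder):
#
#   from gradio_client import Client
#   client = Client("https://<your-username>-<your-space>.hf.space")
#   result = client.predict(
#       "Explain the concept of quantum entanglement in simple terms.",
#       api_name="/predict",
#   )
#   print(result)  # the final accumulated response text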