import gradio as gr
import torch
from beeper_model import BeeperRoseGPT, generate
from tokenizers import Tokenizer
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file as load_safetensors

# ----------------------------
# 🔧 Model versions configuration
# ----------------------------
MODEL_VERSIONS = {
    "Beeper v3 (Multi-Concept)": {
        "repo_id": "AbstractPhil/beeper-rose-v3",
        "model_file": "beeper_final.safetensors",
        "description": "Beeper v3 with 30+ epochs including reasoning, math, coding, and more."
    },
    "Beeper v2 (Extended)": {
        "repo_id": "AbstractPhil/beeper-rose-v2",
        "model_file": "beeper_final.safetensors",
        "description": "Beeper v2 with extended training (~15 epochs)"
    },
    "Beeper v1 (Original)": {
        "repo_id": "AbstractPhil/beeper-rose-tinystories-6l-512d-ctx512",
        "model_file": "beeper_rose_final.safetensors",
        "description": "Original Beeper trained on TinyStories"
    },
}

# Base configuration
config = {
    "context": 512,
    "vocab_size": 8192,
    "dim": 512,
    "n_heads": 8,
    "n_layers": 6,
    "mlp_ratio": 4.0,
    "temperature": 0.9,
    "top_k": 40,
    "top_p": 0.9,
    "repetition_penalty": 1.1,
    "presence_penalty": 0.6,
    "frequency_penalty": 0.0,
    "resid_dropout": 0.1,
    "dropout": 0.0,
    "grad_checkpoint": False,
    "tokenizer_path": "beeper.tokenizer.json"
}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Global model and tokenizer variables
infer = None
tok = None
current_version = None


def load_model_version(version_name):
    """Load the selected model version."""
    global infer, tok, current_version

    if current_version == version_name and infer is not None:
        return f"Already loaded: {version_name}"

    version_info = MODEL_VERSIONS[version_name]

    try:
        # Download model and tokenizer files
        model_file = hf_hub_download(
            repo_id=version_info["repo_id"],
            filename=version_info["model_file"]
        )
        tokenizer_file = hf_hub_download(
            repo_id=version_info["repo_id"],
            filename="tokenizer.json"
        )

        # Initialize model
        infer = BeeperRoseGPT(config).to(device)

        # Load safetensors
        state_dict = load_safetensors(model_file, device=str(device))
        infer.load_state_dict(state_dict)
        infer.eval()

        # Load tokenizer
        tok = Tokenizer.from_file(tokenizer_file)

        current_version = version_name
        return f"Successfully loaded: {version_name}"

    except Exception as e:
        return f"Error loading {version_name}: {str(e)}"


# Load the default model on startup (matches the dropdown default below)
load_status = load_model_version("Beeper v3 (Multi-Concept)")
print(load_status)


# ----------------------------
# 💬 Gradio Chat Wrapper
# ----------------------------
def beeper_reply(message, history, model_version, temperature=None, top_k=None, top_p=None):
    global infer, tok, current_version

    # Load model if the version changed
    if model_version != current_version:
        status = load_model_version(model_version)
        if "Error" in status:
            return f"⚠️ {status}"

    # Check if the model is loaded
    if infer is None or tok is None:
        return "⚠️ Model not loaded. Please select a version and try again."

    # Use defaults if not provided
    if temperature is None:
        temperature = 0.9
    if top_k is None:
        top_k = 40
    if top_p is None:
        top_p = 0.9

    # Try Q&A format since she has some in her corpus
    if "?" in message:
        prompt = f"Q: {message}\nA:"
    elif message.lower().strip() in ["hi", "hello", "hey"]:
        prompt = "The little robot said hello. She said, \""
    elif "story" in message.lower():
        prompt = "Once upon a time, there was a robot. "
    else:
        # Simple continuation
        prompt = message + ". "

    # Generate response with lower temperature for less repetition
    response = generate(
        model=infer,
        tok=tok,
        cfg=config,
        prompt=prompt,
        max_new_tokens=80,                       # Shorter to avoid rambling
        temperature=float(temperature) * 0.8,    # Slightly lower temp
        top_k=int(top_k),
        top_p=float(top_p),
        repetition_penalty=1.2,                  # Higher penalty for repetition
        presence_penalty=0.8,                    # Higher presence penalty
        frequency_penalty=0.1,                   # Add frequency penalty
        device=device,
        detokenize=True
    )

    # Aggressive cleanup
    # Remove the prompt completely
    if response.startswith(prompt):
        response = response[len(prompt):]

    # Remove Q&A format artifacts
    response = response.replace("Q:", "").replace("A:", "")

    # Split on newlines and take the first non-empty line
    lines = response.split('\n')
    for line in lines:
        clean_line = line.strip()
        if clean_line and not clean_line.startswith(message[:10]):
            response = clean_line
            break

    # If the response still contains the user message, try to extract what follows it
    if message.lower()[:20] in response.lower()[:50]:
        # Find where the echo ends
        words_in_message = message.split()
        for i in range(min(5, len(words_in_message)), 0, -1):
            pattern = ' '.join(words_in_message[:i])
            if pattern.lower() in response.lower():
                idx = response.lower().find(pattern.lower()) + len(pattern)
                response = response[idx:].strip()
                break

    # Remove any remaining "User" or "Beeper" artifacts
    for artifact in ["User:", "Beeper:", "U ser:", "Beep er:", "User ", "Beeper "]:
        response = response.replace(artifact, "")

    # Ensure we have something to return
    if not response or len(response) < 3:
        responses = [
            "I like robots and stories!",
            "That's interesting!",
            "I want to play in the park.",
            "The robot was happy.",
            "Yes, I think so too!"
        ]
        import random
        response = random.choice(responses)

    # Clean ending
    response = response.strip()
    if response and response[-1] not in '.!?"':
        response = response.rsplit('.', 1)[0] + '.' if '.' in response else response + '.'

    return response[:200]  # Cap length


# ----------------------------
# 🖼️ Interface
# ----------------------------
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🤖 Beeper - A Rose-based Tiny Language Model

        Hello! I'm Beeper, a small language model trained with love and care.
        Please be patient with me - I'm still learning! 💕
        """
    )

    with gr.Row():
        with gr.Column(scale=3):
            model_dropdown = gr.Dropdown(
                choices=list(MODEL_VERSIONS.keys()),
                value="Beeper v3 (Multi-Concept)",
                label="Select Beeper Version",
                info="Choose which version of Beeper to chat with"
            )
        with gr.Column(scale=7):
            version_info = gr.Markdown(
                "**Current:** Beeper v3 with 30+ epochs including reasoning, math, coding, and more."
            )

    # Update version info when the dropdown changes
    def update_version_info(version_name):
        info = MODEL_VERSIONS[version_name]["description"]
        return f"**Current:** {info}"

    model_dropdown.change(
        fn=update_version_info,
        inputs=[model_dropdown],
        outputs=[version_info]
    )

    # Chat interface
    chatbot = gr.Chatbot(label="Chat with Beeper", type="tuples", height=400)
    msg = gr.Textbox(
        label="Message",
        placeholder="Type your message here... She will probably complete it for now, but maybe she'll answer."
    )

    with gr.Row():
        with gr.Column(scale=2):
            temperature_slider = gr.Slider(0.1, 1.5, value=0.9, step=0.1, label="Temperature")
        with gr.Column(scale=2):
            top_k_slider = gr.Slider(1, 100, value=40, step=1, label="Top-k")
        with gr.Column(scale=2):
            top_p_slider = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p")

    with gr.Row():
        submit = gr.Button("Send", variant="primary")
        clear = gr.Button("Clear")

    # Examples
    gr.Examples(
        examples=[
            ["Hello Beeper! How are you today?"],
            ["Can you tell me a story about a robot?"],
            ["What do you like to do for fun?"],
            ["What makes you happy?"],
            ["Tell me about your dreams"],
        ],
        inputs=msg
    )

    # Handle chat
    def respond(message, chat_history, model_version, temperature, top_k, top_p):
        if not chat_history:
            chat_history = []
        response = beeper_reply(message, chat_history, model_version, temperature, top_k, top_p)
        chat_history.append([message, response])
        return "", chat_history

    msg.submit(
        respond,
        [msg, chatbot, model_dropdown, temperature_slider, top_k_slider, top_p_slider],
        [msg, chatbot]
    )
    submit.click(
        respond,
        [msg, chatbot, model_dropdown, temperature_slider, top_k_slider, top_p_slider],
        [msg, chatbot]
    )
    clear.click(lambda: None, None, chatbot, queue=False)


if __name__ == "__main__":
    demo.launch()