# app.py
# Gradio app exposing full Corpus (coarse) and Capoera (topic/mood) selections
import gc
import json

import gradio as gr
import torch
import spaces  # NEW: for ZeroGPU
from tokenizers import Tokenizer
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file as load_safetensors

from beeper_model import BeeperRoseGPT, generate, prepare_model_for_state_dict

MODEL_VERSIONS = {
    "Beeper v4 (Advanced)": {
        "repo_id": "AbstractPhil/beeper-rose-v4",
        "model_file": "beeper_final.safetensors",
        "description": "Beeper v4 trained on nearly 40% of the full corpus - the most capable version currently.",
    },
    "Beeper v3 (Multi-Concept)": {
        "repo_id": "AbstractPhil/beeper-rose-v3",
        "model_file": "beeper_final.safetensors",
        "description": "Beeper v3 with 30+ epochs including reasoning, math, and ethics",
    },
    "Beeper v2 (Extended)": {
        "repo_id": "AbstractPhil/beeper-rose-v2",
        "model_file": "beeper_final.safetensors",
        "description": "Beeper v2 with extended training (~15 epochs)",
    },
    "Beeper v1 (Original)": {
        "repo_id": "AbstractPhil/beeper-rose-tinystories-6l-512d-ctx512",
        "model_file": "beeper_rose.safetensors",
        "description": "Original Beeper trained on TinyStories",
    },
}

CONFIG = {
    # architecture
    "context": 512, "vocab_size": 8192, "dim": 512, "n_heads": 8, "n_layers": 6, "mlp_ratio": 4.0,
    # default sampling
    "temperature": 0.9, "top_k": 40, "top_p": 0.9,
    "repetition_penalty": 1.10, "presence_penalty": 0.6, "frequency_penalty": 0.0,
    # regularization / training toggles (unused at inference)
    "resid_dropout": 0.1, "dropout": 0.0, "grad_checkpoint": False,
    # runtime pentachora conditioning defaults (Corpus / Capoera selections)
    "runtime_pentachora": {
        "enable": True, "pool": "mean", "temp": 0.10,
        "coarse_alpha": 0.25, "topic_alpha": 0.15, "mood_alpha": 0.10,
    },
}

# no global device pinning — keep model on CPU until ZeroGPU allocates GPU
infer: BeeperRoseGPT | None = None
tok: Tokenizer | None = None
current_version: str | None = None

# Metadata for selectors
CORPUS_CHOICES: list[str] = []
CORPUS_INDEX: dict[str, int] = {}
TOPIC_CHOICES: list[str] = []
MOOD_CHOICES: list[str] = []


def _mood_labels(mood_bins: int) -> list[str]:
    center = mood_bins // 2
    labels = []
    for i in range(mood_bins):
        v = i - center
        name = {
            -3: "Very Negative", -2: "Negative", -1: "Slightly Negative",
            0: "Neutral",
            1: "Slightly Positive", 2: "Positive", 3: "Very Positive",
        }.get(v, f"Valence {v:+d}")
        labels.append(f"{i} ({name} {v:+d})")
    return labels


def _build_choices_from_config(repo_id: str, coarse_C: int, topic_C: int, mood_C: int):
    global CORPUS_CHOICES, CORPUS_INDEX, TOPIC_CHOICES, MOOD_CHOICES
    CORPUS_CHOICES, CORPUS_INDEX = [], {}
    names = []
    try:
        cfg_path = hf_hub_download(repo_id, "config.json")
        with open(cfg_path, "r", encoding="utf-8") as f:
            train_cfg = json.load(f)
        alive = train_cfg.get("_alive_entries")
        if isinstance(alive, list) and all(isinstance(e, dict) for e in alive):
            names = [str(e.get("name", f"Class {i}")) for i, e in enumerate(alive)]
        elif isinstance(train_cfg.get("corpus"), list):
            maybe = [str(e.get("name", f"Class {i}")) for i, e in enumerate(train_cfg["corpus"])]
            if len(maybe) == coarse_C:
                names = maybe
    except Exception:
        names = []
    if len(names) != coarse_C:
        names = [f"Class {i}" for i in range(coarse_C)]
    CORPUS_CHOICES = names
    CORPUS_INDEX = {name: i for i, name in enumerate(names)}
    TOPIC_CHOICES = [str(i) for i in range(topic_C)]
    MOOD_CHOICES = _mood_labels(mood_C)


def load_model_version(version_name: str) -> str:
    global infer, tok, current_version, CORPUS_CHOICES, TOPIC_CHOICES, MOOD_CHOICES
    if current_version == version_name and infer is not None and tok is not None:
        return f"Already loaded: {version_name}"
    info = MODEL_VERSIONS[version_name]
    try:
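        # Resolve the checkpoint and tokenizer from the Hub (hf_hub_download caches them
        # locally); weights are loaded onto CPU so ZeroGPU can attach a GPU only at
        # inference time.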
        model_file = hf_hub_download(info["repo_id"], info["model_file"])
        tokenizer_file = hf_hub_download(info["repo_id"], "tokenizer.json")
        state = load_safetensors(model_file, device="cpu")

        m = BeeperRoseGPT(CONFIG)  # keep on CPU
        prepare_model_for_state_dict(m, state, device="cpu")
        try:
            missing, unexpected = m.load_state_dict(state, strict=True)
            _msg = f"strict load ok | missing={len(missing)} unexpected={len(unexpected)}"
        except Exception as e:
            _msg = f"strict load failed ({e}); non-strict fallback"
            m.load_state_dict(state, strict=False)
        m.eval()

        t = Tokenizer.from_file(tokenizer_file)
        infer, tok, current_version = m, t, version_name

        # Selector sizes come from the loaded pentachora banks; fall back to defaults if absent.
        coarse_C = infer.penta_coarse.size(0) if infer.penta_coarse is not None else 0
        topic_C = infer.penta_medium.size(0) if infer.penta_medium is not None else 512
        mood_C = infer.penta_fine.size(0) if infer.penta_fine is not None else 7
        _build_choices_from_config(info["repo_id"], coarse_C, topic_C, mood_C)

        return f"Successfully loaded: {version_name} ({_msg})"
    except Exception as e:
        infer = None; tok = None; current_version = None
        CORPUS_CHOICES, TOPIC_CHOICES, MOOD_CHOICES = [], [], []
        return f"Error loading {version_name}: {str(e)}"


# Initial load: prefer v4, fall back to v3
try:
    status = load_model_version("Beeper v4 (Advanced)")
    if "Error" in status:
        print(status)
        status = load_model_version("Beeper v3 (Multi-Concept)")
except Exception:
    status = load_model_version("Beeper v3 (Multi-Concept)")
print(status)


def _parse_selected_indices(values: list[str] | None, mapping: dict[str, int] | None = None) -> list[int] | None:
    if not values:
        return None
    if mapping is None:
        # Labels like "3 (Neutral +0)" carry the index as their first token.
        return [int(v.split()[0]) if isinstance(v, str) else int(v) for v in values]
    return [mapping[v] for v in values if v in mapping]


@spaces.GPU()
def beeper_infer(prompt: str, runtime_cfg: dict) -> str:
    """ZeroGPU: allocate GPU only here, move model to GPU for inference."""
    global infer, tok
    dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if dev.type == "cuda" and next(infer.parameters()).device.type != "cuda":
        infer.to(dev)
        torch.cuda.empty_cache()
    try:
        # Pop the UI sampling overrides before runtime_cfg is forwarded to generate(),
        # so only the pentachora conditioning keys remain in it.
        max_new_tokens = int(runtime_cfg.pop("_max_new_tokens"))
        temperature = runtime_cfg.pop("_temperature", None)
        top_k = runtime_cfg.pop("_top_k", None)
        top_p = runtime_cfg.pop("_top_p", None)
        out = generate(
            model=infer, tok=tok, cfg=CONFIG, prompt=prompt,
            max_new_tokens=max_new_tokens,
            temperature=float(temperature) if temperature is not None else None,
            top_k=int(top_k) if top_k is not None else None,
            top_p=float(top_p) if top_p is not None else None,
            repetition_penalty=1.10, presence_penalty=0.8, frequency_penalty=0.1,
            device=dev, detokenize=True,
            runtime_cfg=runtime_cfg,
        )
        return out
    finally:
        # Release the GPU immediately so the ZeroGPU allocation can be returned.
        if dev.type == "cuda":
            infer.to("cpu")
            torch.cuda.empty_cache()
        gc.collect()


def beeper_reply(message, history, model_version, temperature, top_k, top_p, max_new_tokens,
                 corpus_selected, topic_selected, mood_selected):
    global infer, tok, current_version
    if model_version != current_version:
        s = load_model_version(model_version)
        if "Error" in s:
            return f"⚠️ {s}"
    if infer is None or tok is None:
        return "⚠️ Model not loaded. Please select a version and try again."

    # Runtime pentachora config: selected Corpus/Capoera indices plus the UI sampling knobs.
    rt = dict(CONFIG.get("runtime_pentachora", {}))
    rt["coarse_select"] = _parse_selected_indices(corpus_selected, CORPUS_INDEX)
    rt["topic_select"] = _parse_selected_indices(topic_selected, None)
    rt["mood_select"] = _parse_selected_indices(mood_selected, None)
    rt["_temperature"] = temperature
    rt["_top_k"] = top_k
    rt["_top_p"] = top_p
    rt["_max_new_tokens"] = max_new_tokens

    m = (message or "").strip()
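    # Lightweight prompt templating: questions get Q:/A: framing, greetings and
    # "story" requests get canned story-style openers, everything else is passed
    # through with a trailing period.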
    if "?" in m:
        prompt = f"Q: {m}\nA:"
    elif m.lower() in {"hi", "hello", "hey"}:
        prompt = 'The little robot said hello. She said, "'
    elif "story" in m.lower():
        prompt = "Once upon a time, there was a robot. "
    else:
        prompt = m + ". "

    out = beeper_infer(prompt, rt)

    # Strip the prompt scaffold from the reply and tidy the ending.
    if out.startswith(prompt):
        out = out[len(prompt):]
    out = out.replace("Q:", "").replace("A:", "").strip()
    if out and out[-1] not in ".!?”\"'":
        out += "."
    return out[:200]


# ---------------- UI ----------------
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🤖 Beeper — Corpus & Capoera-aware Chat")

    with gr.Row():
        with gr.Column(scale=3):
            model_dropdown = gr.Dropdown(
                choices=list(MODEL_VERSIONS.keys()),
                value="Beeper v4 (Advanced)",
                label="Select Beeper Version",
            )
        with gr.Column(scale=7):
            version_info = gr.Markdown("**Current:** " + MODEL_VERSIONS["Beeper v4 (Advanced)"]["description"])

    with gr.Row():
        with gr.Column():
            corpus_select = gr.Dropdown(choices=CORPUS_CHOICES, multiselect=True, label="Corpus (Coarse classes)")
        with gr.Column():
            topic_select = gr.Dropdown(choices=TOPIC_CHOICES, multiselect=True, label="Capoera Topics (IDs)")
        with gr.Column():
            mood_select = gr.Dropdown(choices=MOOD_CHOICES, multiselect=True, label="Capoera Moods (valence)")

    chatbot = gr.Chatbot(label="Chat with Beeper", height=420)
    msg = gr.Textbox(label="Message", placeholder="Type your message here...")

    with gr.Row():
        with gr.Column(scale=2):
            temperature_slider = gr.Slider(0.1, 1.5, value=0.9, step=0.1, label="Temperature")
        with gr.Column(scale=2):
            top_k_slider = gr.Slider(1, 100, value=40, step=1, label="Top-k")
        with gr.Column(scale=2):
            top_p_slider = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p")
        with gr.Column(scale=2):
            max_new_tokens_slider = gr.Slider(20, 512, value=128, step=1, label="Max new tokens")

    with gr.Row():
        submit = gr.Button("Send", variant="primary")
        clear = gr.Button("Clear")

    def on_change_version(version_name: str):
        status = load_model_version(version_name)
        info = f"**Current:** {MODEL_VERSIONS[version_name]['description']}  \n{status}"
        return (
            info,
            gr.update(choices=CORPUS_CHOICES, value=[]),
            gr.update(choices=TOPIC_CHOICES, value=[]),
            gr.update(choices=MOOD_CHOICES, value=[]),
        )

    model_dropdown.change(
        on_change_version,
        inputs=[model_dropdown],
        outputs=[version_info, corpus_select, topic_select, mood_select],
    )

    def respond(message, chat_history, model_version, temperature, top_k, top_p, max_new_tokens,
                corpus_selected, topic_selected, mood_selected):
        if chat_history is None:
            chat_history = []
        resp = beeper_reply(message, chat_history, model_version, temperature, top_k, top_p, max_new_tokens,
                            corpus_selected, topic_selected, mood_selected)
        chat_history.append((message, resp))
        return "", chat_history

    inputs_all = [msg, chatbot, model_dropdown, temperature_slider, top_k_slider, top_p_slider,
                  max_new_tokens_slider, corpus_select, topic_select, mood_select]
    outputs_all = [msg, chatbot]

    msg.submit(respond, inputs_all, outputs_all, concurrency_id="infer", concurrency_limit="default")
    submit.click(respond, inputs_all, outputs_all, concurrency_id="infer", concurrency_limit="default")
    clear.click(lambda: None, None, chatbot, queue=False)

if __name__ == "__main__":
    demo.queue(
        max_size=256,
        default_concurrency_limit=1,
        status_update_rate="auto",
        api_open=False,
    ).launch()