"""
Rose Beeper Model - Inference Example
Simple script showing how to load and use the model for text generation
"""

import os
from typing import Optional

import torch
from tokenizers import Tokenizer
from huggingface_hub import hf_hub_download

# Import the extracted components (assumed to live in a module called 'beeper_inference')
from beeper_inference import BeeperRoseGPT, BeeperIO, generate, get_default_config

def load_model_for_inference(
    checkpoint_path: Optional[str] = None,
    tokenizer_path: str = "beeper.tokenizer.json",
    hf_repo: str = "AbstractPhil/beeper-rose-v5",
    device: str = "cuda"
):
    """
    Load the Rose Beeper model for inference.
    
    Args:
        checkpoint_path: Path to local checkpoint file (.pt or .safetensors)
        tokenizer_path: Path to tokenizer file
        hf_repo: HuggingFace repository to download from if no local checkpoint
        device: Device to load model on ("cuda" or "cpu")
    
    Returns:
        Tuple of (model, tokenizer, config)
    """
    # Get default configuration
    config = get_default_config()
    
    # Set device
    device = torch.device(device if torch.cuda.is_available() else "cpu")
    
    # Initialize model
    model = BeeperRoseGPT(config).to(device)
    
    # Initialize pentachora banks
    # These are the default sizes from the training configuration
    cap_cfg = config.get("capoera", {})
    coarse_C = 20  # Approximate number of alive datasets
    model.ensure_pentachora(
        coarse_C=coarse_C,
        medium_C=int(cap_cfg.get("topic_bins", 512)),
        fine_C=int(cap_cfg.get("mood_bins", 7)),
        dim=config["dim"],
        device=device
    )
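    # The banks are created before loading so that any bank tensors stored in the
    # checkpoint have matching parameters to load into (an assumption based on the
    # strict=False load below).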
    
    # Load checkpoint
    loaded = False
    
    # Try loading from local path
    if checkpoint_path and os.path.exists(checkpoint_path):
        print(f"Loading model from: {checkpoint_path}")
        missing, unexpected = BeeperIO.load_into_model(
            model, checkpoint_path, map_location="cpu", strict=False
        )
        print(f"Loaded | missing={len(missing)} unexpected={len(unexpected)}")
        loaded = True
    
    # Try downloading from HuggingFace
    if not loaded and hf_repo:
        try:
            print(f"Downloading model from HuggingFace: {hf_repo}")
            path = hf_hub_download(repo_id=hf_repo, filename="beeper_final.safetensors")
            missing, unexpected = BeeperIO.load_into_model(
                model, path, map_location="cpu", strict=False
            )
            print(f"Loaded | missing={len(missing)} unexpected={len(unexpected)}")
            loaded = True
        except Exception as e:
            print(f"Failed to download from HuggingFace: {e}")
    
    if not loaded:
        print("WARNING: No weights loaded, using random initialization!")
    
    # Load tokenizer
    if os.path.exists(tokenizer_path):
        tok = Tokenizer.from_file(tokenizer_path)
        print(f"Loaded tokenizer from: {tokenizer_path}")
    else:
        # Try downloading tokenizer from HF
        try:
            tok_path = hf_hub_download(repo_id=hf_repo, filename="tokenizer.json")
            tok = Tokenizer.from_file(tok_path)
            print(f"Downloaded tokenizer from HuggingFace")
        except Exception as e:
            raise RuntimeError(f"Could not load tokenizer: {e}")
    
    # Set model to eval mode
    model.eval()
    
    return model, tok, config


def interactive_generation(model, tokenizer, config, device="cuda"):
    """
    Interactive text generation loop.
    
    Args:
        model: The loaded BeeperRoseGPT model
        tokenizer: The tokenizer
        config: Model configuration
        device: Device to run on
    """
    device = torch.device(device if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    
    print("\n=== Rose Beeper Interactive Generation ===")
    print("Enter your prompt (or 'quit' to exit)")
    print("Commands: /temp <value>, /top_k <value>, /top_p <value>, /max <tokens>")
    print("-" * 50)
    
    # Generation settings (can be modified)
    settings = {
        "max_new_tokens": 100,
        "temperature": config["temperature"],
        "top_k": config["top_k"],
        "top_p": config["top_p"],
        "repetition_penalty": config["repetition_penalty"],
        "presence_penalty": config["presence_penalty"],
        "frequency_penalty": config["frequency_penalty"],
    }
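    # Note: the penalty values above are taken straight from the loaded config and are
    # not adjustable via the slash commands below (only temperature, top_k, top_p,
    # and max_new_tokens are).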
    
    while True:
        prompt = input("\nPrompt: ").strip()
        
        if prompt.lower() == 'quit':
            break
        
        # Handle commands
        if prompt.startswith('/'):
            parts = prompt.split()
            cmd = parts[0].lower()
            
            if cmd == '/temp' and len(parts) > 1:
                settings["temperature"] = float(parts[1])
                print(f"Temperature set to {settings['temperature']}")
                continue
            elif cmd == '/top_k' and len(parts) > 1:
                settings["top_k"] = int(parts[1])
                print(f"Top-k set to {settings['top_k']}")
                continue
            elif cmd == '/top_p' and len(parts) > 1:
                settings["top_p"] = float(parts[1])
                print(f"Top-p set to {settings['top_p']}")
                continue
            elif cmd == '/max' and len(parts) > 1:
                settings["max_new_tokens"] = int(parts[1])
                print(f"Max tokens set to {settings['max_new_tokens']}")
                continue
            else:
                print("Unknown command")
                continue
        
        if not prompt:
            continue
        
        # Generate text
        print("\nGenerating...")
        output = generate(
            model=model,
            tok=tokenizer,
            cfg=config,
            prompt=prompt,
            device=device,
            **settings
        )
        
        print("\nOutput:", output)
        print("-" * 50)


def batch_generation_example(model, tokenizer, config, device="cuda"):
    """
    Example of batch generation with different settings.
    """
    device = torch.device(device if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    
    prompts = [
        "The robot went to school and",
        "Once upon a time in a magical forest",
        "The scientist discovered that",
        "In the year 2050, humanity",
        "The philosophy of mind suggests",
    ]
    
    print("\n=== Batch Generation Examples ===\n")
    
    for prompt in prompts:
        print(f"Prompt: {prompt}")
        
        # Generate with different temperatures
        for temp in [0.5, 0.9, 1.2]:
            output = generate(
                model=model,
                tok=tokenizer,
                cfg=config,
                prompt=prompt,
                max_new_tokens=50,
                temperature=temp,
                device=device
            )
            print(f"  Temp {temp}: {output}")
        
        print("-" * 50)


# Main execution example
if __name__ == "__main__":
    # Pick a device up front so the example also works without a GPU
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Load model
    model, tokenizer, config = load_model_for_inference(
        checkpoint_path=None,  # Will download from HF
        hf_repo="AbstractPhil/beeper-rose-v5",
        device=device
    )

    # Example: Single generation
    print("\n=== Single Generation Example ===")
    output = generate(
        model=model,
        tok=tokenizer,
        cfg=config,
        prompt="The meaning of life is",
        max_new_tokens=100,
        temperature=0.9,
        device=device
    )
    print(f"Output: {output}")
    
    # Example: Batch generation with different settings
    # batch_generation_example(model, tokenizer, config)
    
    # Example: Interactive generation
    # interactive_generation(model, tokenizer, config)
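
    # Example: Load a local checkpoint on CPU instead of downloading from the Hub.
    # A sketch only: the file names below mirror the defaults used above and are
    # assumptions about what exists on disk, not guaranteed paths.
    # model, tokenizer, config = load_model_for_inference(
    #     checkpoint_path="beeper_final.safetensors",
    #     tokenizer_path="beeper.tokenizer.json",
    #     device="cpu",
    # )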