""" | |
Rose Beeper Model - Inference Example | |
Simple script showing how to load and use the model for text generation | |
""" | |
import torch | |
from tokenizers import Tokenizer | |
from huggingface_hub import hf_hub_download | |
# Import the extracted components (assuming they're in a module called 'beeper_inference') | |
# from beeper_inference import BeeperRoseGPT, BeeperIO, generate, get_default_config | |
def load_model_for_inference(
    checkpoint_path: Optional[str] = None,
    tokenizer_path: str = "beeper.tokenizer.json",
    hf_repo: str = "AbstractPhil/beeper-rose-v5",
    device: str = "cuda",
):
    """
    Load the Rose Beeper model for inference.

    Args:
        checkpoint_path: Path to a local checkpoint file (.pt or .safetensors)
        tokenizer_path: Path to the tokenizer file
        hf_repo: HuggingFace repository to download from if no local checkpoint
        device: Device to load the model on ("cuda" or "cpu")

    Returns:
        Tuple of (model, tokenizer, config)
    """
    # Get the default configuration
    config = get_default_config()

    # Resolve the device, falling back to CPU if CUDA is unavailable
    device = torch.device(device if torch.cuda.is_available() else "cpu")

    # Initialize the model
    model = BeeperRoseGPT(config).to(device)

    # Initialize the pentachora banks.
    # These are the default sizes from the training configuration.
    cap_cfg = config.get("capoera", {})
    coarse_C = 20  # Approximate number of alive datasets
    model.ensure_pentachora(
        coarse_C=coarse_C,
        medium_C=int(cap_cfg.get("topic_bins", 512)),
        fine_C=int(cap_cfg.get("mood_bins", 7)),
        dim=config["dim"],
        device=device,
    )
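
    # NOTE (assumption): these bank sizes mirror the training configuration;
    # if your checkpoint was trained on a different number of datasets,
    # adjust coarse_C to match before loading weights.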
    # Load checkpoint
    loaded = False

    # Try loading from a local path first
    if checkpoint_path and os.path.exists(checkpoint_path):
        print(f"Loading model from: {checkpoint_path}")
        missing, unexpected = BeeperIO.load_into_model(
            model, checkpoint_path, map_location="cpu", strict=False
        )
        print(f"Loaded | missing={len(missing)} unexpected={len(unexpected)}")
        loaded = True

    # Otherwise, try downloading from HuggingFace
    if not loaded and hf_repo:
        try:
            print(f"Downloading model from HuggingFace: {hf_repo}")
            path = hf_hub_download(repo_id=hf_repo, filename="beeper_final.safetensors")
            missing, unexpected = BeeperIO.load_into_model(
                model, path, map_location="cpu", strict=False
            )
            print(f"Loaded | missing={len(missing)} unexpected={len(unexpected)}")
            loaded = True
        except Exception as e:
            print(f"Failed to download from HuggingFace: {e}")

    if not loaded:
        print("WARNING: No weights loaded, using random initialization!")

    # Load the tokenizer
    if os.path.exists(tokenizer_path):
        tok = Tokenizer.from_file(tokenizer_path)
        print(f"Loaded tokenizer from: {tokenizer_path}")
    else:
        # Try downloading the tokenizer from HF
        try:
            tok_path = hf_hub_download(repo_id=hf_repo, filename="tokenizer.json")
            tok = Tokenizer.from_file(tok_path)
            print("Downloaded tokenizer from HuggingFace")
        except Exception as e:
            raise RuntimeError(f"Could not load tokenizer: {e}") from e

    # Set the model to eval mode
    model.eval()

    return model, tok, config
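
# Example usage (hypothetical local paths -- adjust to your setup; falls back
# to the HF download path when checkpoint_path is None or missing):
#   model, tok, config = load_model_for_inference(
#       checkpoint_path="beeper_final.safetensors",
#       tokenizer_path="beeper.tokenizer.json",
#       device="cpu",
#   )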
def interactive_generation(model, tokenizer, config, device="cuda"):
    """
    Interactive text generation loop.

    Args:
        model: The loaded BeeperRoseGPT model
        tokenizer: The tokenizer
        config: Model configuration
        device: Device to run on
    """
    device = torch.device(device if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    print("\n=== Rose Beeper Interactive Generation ===")
    print("Enter your prompt (or 'quit' to exit)")
    print("Commands: /temp <value>, /top_k <value>, /top_p <value>, /max <tokens>")
    print("-" * 50)

    # Generation settings (can be modified)
    settings = {
        "max_new_tokens": 100,
        "temperature": config["temperature"],
        "top_k": config["top_k"],
        "top_p": config["top_p"],
        "repetition_penalty": config["repetition_penalty"],
        "presence_penalty": config["presence_penalty"],
        "frequency_penalty": config["frequency_penalty"],
    }
    while True:
        prompt = input("\nPrompt: ").strip()

        if prompt.lower() == 'quit':
            break

        # Handle settings commands
        if prompt.startswith('/'):
            parts = prompt.split()
            cmd = parts[0].lower()
            try:
                if cmd == '/temp' and len(parts) > 1:
                    settings["temperature"] = float(parts[1])
                    print(f"Temperature set to {settings['temperature']}")
                elif cmd == '/top_k' and len(parts) > 1:
                    settings["top_k"] = int(parts[1])
                    print(f"Top-k set to {settings['top_k']}")
                elif cmd == '/top_p' and len(parts) > 1:
                    settings["top_p"] = float(parts[1])
                    print(f"Top-p set to {settings['top_p']}")
                elif cmd == '/max' and len(parts) > 1:
                    settings["max_new_tokens"] = int(parts[1])
                    print(f"Max tokens set to {settings['max_new_tokens']}")
                else:
                    print("Unknown command")
            except ValueError:
                print(f"Invalid value for {cmd}")
            continue

        if not prompt:
            continue

        # Generate text
        print("\nGenerating...")
        output = generate(
            model=model,
            tok=tokenizer,
            cfg=config,
            prompt=prompt,
            device=device,
            **settings,
        )
        print("\nOutput:", output)
        print("-" * 50)
def batch_generation_example(model, tokenizer, config, device="cuda"):
    """
    Example of batch generation with different settings.
    """
    device = torch.device(device if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    prompts = [
        "The robot went to school and",
        "Once upon a time in a magical forest",
        "The scientist discovered that",
        "In the year 2050, humanity",
        "The philosophy of mind suggests",
    ]

    print("\n=== Batch Generation Examples ===\n")

    for prompt in prompts:
        print(f"Prompt: {prompt}")

        # Generate with different temperatures
        for temp in [0.5, 0.9, 1.2]:
            output = generate(
                model=model,
                tok=tokenizer,
                cfg=config,
                prompt=prompt,
                max_new_tokens=50,
                temperature=temp,
                device=device,
            )
            print(f"  Temp {temp}: {output}")

        print("-" * 50)
# Main execution example
if __name__ == "__main__":
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Load model (downloads from HF when no local checkpoint is given)
    model, tokenizer, config = load_model_for_inference(
        checkpoint_path=None,
        hf_repo="AbstractPhil/beeper-rose-v5",
        device=device,
    )

    # Example: single generation
    print("\n=== Single Generation Example ===")
    output = generate(
        model=model,
        tok=tokenizer,
        cfg=config,
        prompt="The meaning of life is",
        max_new_tokens=100,
        temperature=0.9,
        device=device,
    )
    print(f"Output: {output}")

    # Example: batch generation with different settings
    # batch_generation_example(model, tokenizer, config)

    # Example: interactive generation
    # interactive_generation(model, tokenizer, config)