#!/usr/bin/env python3
"""Zero GPU Monitor - Keeps the Space alive and monitors health.

Periodically issues a GET against ``SPACE_URL`` and, while the Space answers
with HTTP 200, POSTs a small prediction request to ``/api/predict`` so the
Space keeps receiving traffic.

Usage:
    monitor.py [SPACE_URL]

The URL is taken from the first command-line argument when present, otherwise
from the ``SPACE_URL`` environment variable, otherwise it defaults to
``http://localhost:7860``.
"""

import os
import sys
import time
from datetime import datetime

import requests

# Configuration
SPACE_URL = os.environ.get('SPACE_URL', 'http://localhost:7860')
CHECK_INTERVAL = 180  # 3 minutes
# NOTE(review): HEALTH_ENDPOINT is not referenced by any function in this
# file; check_space_health() probes SPACE_URL itself. Confirm whether the
# Space actually exposes this path before wiring it in.
HEALTH_ENDPOINT = '/health'
MAX_FAILURES = 3

# Seconds to wait before the next cycle after an unexpected error in the loop.
_ERROR_BACKOFF = 60


def check_space_health() -> bool:
    """Check if the Space is responding.

    Returns:
        True when a GET on ``SPACE_URL`` completes within 10 seconds with
        status 200; False on any other status or on a request error.
    """
    try:
        response = requests.get(SPACE_URL, timeout=10)
    except requests.RequestException as e:
        print(f"Health check failed: {e}")
        return False
    return response.status_code == 200


def keep_space_warm() -> bool:
    """Send a dummy request to keep the Space warm.

    Returns:
        True when the POST to ``/api/predict`` completes within 30 seconds
        with status 200 or 202; False on any other status or on a request
        error.
    """
    # The argument list is positional and must match what the Space's
    # predict function expects.
    # NOTE(review): the first element looks like a protein sequence and the
    # remaining values like generation settings - confirm against the Space.
    payload = {
        "fn_index": 0,
        "data": [
            "MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK",
            1,
            10,
            "Small",
            False,
            10,
        ],
    }
    # Strip a trailing slash so a SPACE_URL such as "https://host/" does not
    # produce "https://host//api/predict".
    predict_url = f"{SPACE_URL.rstrip('/')}/api/predict"
    try:
        response = requests.post(predict_url, json=payload, timeout=30)
    except requests.RequestException as e:
        print(f"Keep-warm request failed: {e}")
        return False
    return response.status_code in (200, 202)


def monitor_loop() -> None:
    """Main monitoring loop.

    Runs until interrupted with Ctrl-C. Each cycle performs a health check;
    when the Space is healthy and at least ``CHECK_INTERVAL`` seconds have
    passed since the last successful warm-up (or since start-up), a keep-warm
    request is sent. Consecutive failed health checks are counted and an
    error is printed once the count reaches ``MAX_FAILURES``. Any unexpected
    exception inside a cycle is printed and the loop resumes after a shorter
    back-off.
    """
    print("Starting Zero GPU Space monitor...")
    print(f"Space URL: {SPACE_URL}")
    print(f"Check interval: {CHECK_INTERVAL} seconds")

    consecutive_failures = 0
    # Elapsed time is measured on the monotonic clock so wall-clock
    # adjustments cannot skip or stall the warm-up schedule.
    last_warm_up = time.monotonic()

    # KeyboardInterrupt is handled around the whole loop so that Ctrl-C is
    # caught cleanly no matter which sleep or request it interrupts.
    try:
        while True:
            delay = CHECK_INTERVAL
            try:
                cycle_start = time.monotonic()
                print(f"\n[{datetime.now()}] Performing health check...")

                if check_space_health():
                    print("✓ Space is healthy")
                    consecutive_failures = 0

                    # Warm up only once a full interval has elapsed.
                    if cycle_start - last_warm_up >= CHECK_INTERVAL:
                        print("Sending keep-warm request...")
                        if keep_space_warm():
                            print("✓ Keep-warm successful")
                            last_warm_up = cycle_start
                        else:
                            print("✗ Keep-warm failed")
                else:
                    consecutive_failures += 1
                    print(
                        "✗ Space is not responding "
                        f"(failure {consecutive_failures}/{MAX_FAILURES})"
                    )

                    if consecutive_failures >= MAX_FAILURES:
                        print("ERROR: Space appears to be down!")
                        print("Please restart the Space from Hugging Face interface")
                        # Could add notification logic here
            except Exception as e:
                # Top-level boundary: report and keep the monitor running.
                print(f"Monitor error: {e}")
                delay = _ERROR_BACKOFF  # Wait a minute before retrying

            # Wait before next check
            time.sleep(delay)
    except KeyboardInterrupt:
        print("\nMonitor stopped by user")


if __name__ == "__main__":
    # Get Space URL from environment or command line; a command-line
    # argument takes precedence over the SPACE_URL environment variable.
    if len(sys.argv) > 1:
        SPACE_URL = sys.argv[1]

    monitor_loop()