Spaces:
Running
on
Zero
Running
on
Zero
#!/usr/bin/env python3 | |
""" | |
Zero GPU Monitor - keeps the Space alive and monitors its health.
""" | |
import os | |
import time | |
import requests | |
import sys | |
from datetime import datetime | |
# Configuration
# Base URL of the Space to monitor; the SPACE_URL environment variable
# overrides the local default.
SPACE_URL = os.getenv("SPACE_URL", "http://localhost:7860")
CHECK_INTERVAL = 3 * 60  # seconds between health checks (3 minutes)
HEALTH_ENDPOINT = "/health"  # not referenced by the functions visible in this file
MAX_FAILURES = 3  # consecutive failed checks before the Space is reported down
def check_space_health():
    """Report whether a GET on the Space's base URL answers with HTTP 200.

    Returns:
        True when the request completes with status code 200. False for any
        other status code, or when the request raises; in that case the
        error is printed and not re-raised.
    """
    try:
        reply = requests.get(SPACE_URL, timeout=10)
    except Exception as err:
        print(f"Health check failed: {err}")
        return False
    else:
        is_healthy = reply.status_code == 200
        return is_healthy
def keep_space_warm():
    """Send a dummy prediction request to keep the Space warm.

    POSTs a fixed JSON payload to ``<SPACE_URL>/api/predict`` with a
    30-second timeout. Any trailing slash on SPACE_URL is stripped before
    the path is appended, so a base URL such as ``https://host/`` does not
    produce ``https://host//api/predict``.

    Returns:
        True if the server answers with status 200 or 202. False for any
        other status code, or when the request itself fails (the error is
        printed). Errors that are not ``requests`` request errors are left
        to propagate to the caller.
    """
    # The payload is sent verbatim. The meaning of the positional values in
    # "data" is defined by the Space's API and is not visible in this file.
    payload = {
        "fn_index": 0,
        "data": [
            "MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK",
            1,
            10,
            "Small",
            False,
            10,
        ],
    }
    predict_url = f"{SPACE_URL.rstrip('/')}/api/predict"

    # Only the network call belongs in the try body; a narrow except keeps
    # programming errors from being reported as a failed keep-warm.
    try:
        response = requests.post(predict_url, json=payload, timeout=30)
    except requests.RequestException as e:
        print(f"Keep-warm request failed: {e}")
        return False
    return response.status_code in (200, 202)
def monitor_loop():
    """Run the monitoring loop until interrupted with Ctrl-C.

    Each iteration calls ``check_space_health()``:

    * On success the failure counter is reset and, if at least
      CHECK_INTERVAL seconds have passed since the last successful
      warm-up (or since start-up), ``keep_space_warm()`` is called.
    * On failure the counter is incremented; once it reaches MAX_FAILURES
      an error message is printed on every further failed check.

    The loop then sleeps CHECK_INTERVAL seconds. An unexpected exception
    inside an iteration is printed and followed by a 60-second pause before
    the loop resumes. KeyboardInterrupt stops the monitor cleanly, including
    when it arrives during that 60-second pause.
    """
    print("Starting Zero GPU Space monitor...")
    print(f"Space URL: {SPACE_URL}")
    print(f"Check interval: {CHECK_INTERVAL} seconds")

    consecutive_failures = 0
    # Elapsed time is measured on the monotonic clock so wall-clock jumps
    # (DST changes, NTP corrections) cannot skip or bunch warm-up requests.
    last_warm_up = time.monotonic()

    # The KeyboardInterrupt handler wraps the whole loop so that Ctrl-C is
    # also handled while the error path below is sleeping.
    try:
        while True:
            try:
                print(f"\n[{datetime.now()}] Performing health check...")
                check_started = time.monotonic()

                if check_space_health():
                    print("β Space is healthy")
                    consecutive_failures = 0

                    # The loop sleeps CHECK_INTERVAL between iterations, so
                    # this is true on every check after the first one.
                    if check_started - last_warm_up >= CHECK_INTERVAL:
                        print("Sending keep-warm request...")
                        if keep_space_warm():
                            print("β Keep-warm successful")
                            last_warm_up = check_started
                        else:
                            print("β Keep-warm failed")
                else:
                    consecutive_failures += 1
                    print(f"β Space is not responding (failure {consecutive_failures}/{MAX_FAILURES})")

                    if consecutive_failures >= MAX_FAILURES:
                        print("ERROR: Space appears to be down!")
                        print("Please restart the Space from Hugging Face interface")
                        # Could add notification logic here

                # Wait before next check
                time.sleep(CHECK_INTERVAL)
            except Exception as e:
                # Top-level boundary: report the error and keep monitoring.
                print(f"Monitor error: {e}")
                time.sleep(60)  # Wait a minute before retrying
    except KeyboardInterrupt:
        print("\nMonitor stopped by user")
if __name__ == "__main__":
    # A positional command-line argument, when present, replaces the
    # SPACE_URL taken from the environment (or its built-in default).
    cli_args = sys.argv[1:]
    if cli_args:
        SPACE_URL = cli_args[0]

    monitor_loop()