Spaces:
Running
on
Zero
Running
on
Zero
File size: 3,320 Bytes
7a6c881 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 |
#!/usr/bin/env python3
"""
Zero GPU Monitor - Keeps the Space alive and monitors health
"""
import os
import time
import requests
import sys
from datetime import datetime
# Configuration
# Base URL of the Space to monitor; taken from the SPACE_URL environment
# variable, falling back to a local instance on port 7860.
SPACE_URL = os.environ.get('SPACE_URL', 'http://localhost:7860')
# Seconds to sleep between health checks; also the minimum gap the monitor
# loop requires between keep-warm requests.
CHECK_INTERVAL = 180 # 3 minutes
# Path of a health route. NOTE(review): not referenced by any function
# visible in this file - the health check requests SPACE_URL itself.
HEALTH_ENDPOINT = '/health'
# Number of consecutive failed health checks after which the monitor loop
# reports the Space as down.
MAX_FAILURES = 3
def check_space_health():
    """Probe the Space with a GET request on its base URL.

    Returns:
        True when the request completes with HTTP status 200; False for any
        other status or when the request raises (the error is printed).
    """
    try:
        # 10-second cap so an unresponsive Space cannot stall the monitor.
        status = requests.get(SPACE_URL, timeout=10).status_code
    except Exception as err:
        print(f"Health check failed: {err}")
        return False
    return status == 200
def keep_space_warm():
    """POST a fixed sample request to the Space's /api/predict route.

    The request body targets fn_index 0 with hard-coded sample arguments;
    the response content is ignored, only the status code is inspected.

    Returns:
        True when the Space answers with HTTP 200 or 202; False for any
        other status or when the request raises (the error is printed).
    """
    # Hard-coded sample arguments sent as the "data" field of the request.
    sample_args = [
        "MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK",
        1,
        10,
        "Small",
        False,
        10,
    ]
    try:
        reply = requests.post(
            f"{SPACE_URL}/api/predict",
            json={"fn_index": 0, "data": sample_args},
            timeout=30,
        )
        accepted = reply.status_code in (200, 202)
    except Exception as err:
        print(f"Keep-warm request failed: {err}")
        return False
    return accepted
def monitor_loop():
    """Run the health-check / keep-warm cycle until interrupted.

    Each iteration probes the Space with check_space_health(). While the
    Space is healthy, keep_space_warm() is called whenever more than
    CHECK_INTERVAL seconds have elapsed since the last successful keep-warm
    (or since start-up). Failed probes are counted; once MAX_FAILURES
    consecutive probes have failed, an error is printed on every further
    failed probe. The loop then sleeps CHECK_INTERVAL seconds.

    Ctrl-C (KeyboardInterrupt) ends the loop. Any other exception is
    printed and the loop resumes after a 60-second pause.
    """
    print("Starting Zero GPU Space monitor...")
    print(f"Space URL: {SPACE_URL}")
    print(f"Check interval: {CHECK_INTERVAL} seconds")

    consecutive_failures = 0
    # Elapsed time is tracked on the monotonic clock. Wall-clock time can
    # jump (DST change, NTP step); with naive datetime arithmetic a backward
    # jump would suppress keep-warm requests until the clock caught up.
    last_warm_up = time.monotonic()

    while True:
        try:
            check_started = time.monotonic()
            # Wall-clock time is used for the log line only.
            print(f"\n[{datetime.now()}] Performing health check...")

            # Check if Space is healthy
            if check_space_health():
                print("✓ Space is healthy")
                consecutive_failures = 0

                # Send a keep-warm request once the interval has passed
                time_since_warmup = check_started - last_warm_up
                if time_since_warmup > CHECK_INTERVAL:
                    print("Sending keep-warm request...")
                    if keep_space_warm():
                        print("✓ Keep-warm successful")
                        last_warm_up = check_started
                    else:
                        # last_warm_up is left alone so the next healthy
                        # check retries immediately.
                        print("✗ Keep-warm failed")
            else:
                consecutive_failures += 1
                print(f"✗ Space is not responding (failure {consecutive_failures}/{MAX_FAILURES})")
                if consecutive_failures >= MAX_FAILURES:
                    print("ERROR: Space appears to be down!")
                    print("Please restart the Space from Hugging Face interface")
                    # Could add notification logic here

            # Wait before next check
            time.sleep(CHECK_INTERVAL)
        except KeyboardInterrupt:
            print("\nMonitor stopped by user")
            break
        except Exception as e:
            print(f"Monitor error: {e}")
            time.sleep(60)  # Wait a minute before retrying
if __name__ == "__main__":
    # A URL passed as the first command-line argument overrides the value
    # SPACE_URL received from the environment (or its default).
    if len(sys.argv) > 1:
        SPACE_URL = sys.argv[1]
    monitor_loop()