Transeption_iGEM_BASISCHINA_2025 / zero_gpu_monitor.py
MoraxCheng's picture
Add keep-alive scripts and environment configuration for Tranception Space
7a6c881
#!/usr/bin/env python3
"""
Zero GPU Monitor - Keeps the Space alive and monitors health
"""
import os
import time
import requests
import sys
from datetime import datetime
# Configuration
# Base URL of the Space. The SPACE_URL environment variable overrides the
# local default; when run as a script, a first command-line argument
# overrides both.
SPACE_URL = os.getenv('SPACE_URL', 'http://localhost:7860')
# Seconds between monitoring cycles (3 minutes).
CHECK_INTERVAL = 3 * 60
# Path of a health endpoint. NOTE(review): not referenced by any code visible
# in this file -- the health check requests SPACE_URL itself.
HEALTH_ENDPOINT = '/health'
# Number of consecutive failed health checks after which the Space is
# reported as down.
MAX_FAILURES = 3
def check_space_health():
    """Report whether a GET on the Space's base URL answers with HTTP 200.

    Returns:
        True when the request to SPACE_URL completes with status code 200.
        False for any other status code, or when the request raises (the
        error is printed rather than propagated).
    """
    try:
        # 10-second timeout so an unresponsive Space cannot stall the monitor.
        status = requests.get(SPACE_URL, timeout=10).status_code
    except Exception as err:
        print(f"Health check failed: {err}")
        return False
    return status == 200
def keep_space_warm():
    """POST a fixed sample request to the Space's predict API to keep it warm.

    Returns:
        True when the POST to ``{SPACE_URL}/api/predict`` answers with HTTP
        200 or 202. False for any other status code, or when the request
        raises (the error is printed rather than propagated).
    """
    # Positional inputs for the Space function at index 0. Their meaning is
    # defined by the Space's API, which is not visible in this file
    # (presumably a protein sequence followed by model options -- confirm).
    sample_inputs = [
        "MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK",
        1,
        10,
        "Small",
        False,
        10,
    ]
    try:
        reply = requests.post(
            f"{SPACE_URL}/api/predict",
            json={"fn_index": 0, "data": sample_inputs},
            timeout=30,
        )
    except Exception as err:
        print(f"Keep-warm request failed: {err}")
        return False
    return reply.status_code in (200, 202)
def monitor_loop():
    """Run the health-check / keep-warm cycle until interrupted.

    Each cycle calls check_space_health(). On success the failure counter is
    reset and, once at least CHECK_INTERVAL seconds have passed since the
    last successful warm-up (or since start-up), keep_space_warm() is called.
    On failure the counter is incremented, and an error message is printed
    whenever it has reached MAX_FAILURES. The cycle then sleeps for
    CHECK_INTERVAL seconds.

    Any unexpected exception raised inside a cycle is printed and followed by
    a 60-second pause before the next cycle. KeyboardInterrupt (Ctrl-C) at
    any point -- including during that pause -- stops the loop and the
    function returns normally.
    """
    print("Starting Zero GPU Space monitor...")
    print(f"Space URL: {SPACE_URL}")
    print(f"Check interval: {CHECK_INTERVAL} seconds")

    consecutive_failures = 0
    # Elapsed time is tracked on the monotonic clock: wall-clock jumps (DST,
    # NTP corrections) must not delay or suppress keep-warm requests.
    last_warm_up = time.monotonic()

    # The KeyboardInterrupt handler wraps the whole loop so that Ctrl-C is
    # also honoured while sleeping inside the error back-off below.
    try:
        while True:
            try:
                cycle_start = time.monotonic()
                # Wall-clock time is used for the log line only.
                print(f"\n[{datetime.now()}] Performing health check...")

                if check_space_health():
                    print("✓ Space is healthy")
                    consecutive_failures = 0

                    # Warm up once a full interval has elapsed; a failed
                    # warm-up leaves last_warm_up unchanged so the next
                    # cycle retries.
                    if cycle_start - last_warm_up >= CHECK_INTERVAL:
                        print("Sending keep-warm request...")
                        if keep_space_warm():
                            print("✓ Keep-warm successful")
                            last_warm_up = cycle_start
                        else:
                            print("✗ Keep-warm failed")
                else:
                    consecutive_failures += 1
                    print(f"✗ Space is not responding (failure {consecutive_failures}/{MAX_FAILURES})")

                    if consecutive_failures >= MAX_FAILURES:
                        print("ERROR: Space appears to be down!")
                        print("Please restart the Space from Hugging Face interface")
                        # Could add notification logic here

                # Wait before next check
                time.sleep(CHECK_INTERVAL)
            except Exception as e:
                # Top-level boundary: report and keep the monitor alive.
                print(f"Monitor error: {e}")
                time.sleep(60)  # Wait a minute before retrying
    except KeyboardInterrupt:
        print("\nMonitor stopped by user")
if __name__ == "__main__":
    # A first command-line argument, when present, replaces the SPACE_URL
    # obtained from the environment (or its default).
    cli_args = sys.argv[1:]
    if cli_args:
        SPACE_URL = cli_args[0]
    monitor_loop()