File size: 3,320 Bytes
7a6c881
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#!/usr/bin/env python3
"""
Zero GPU Monitor - Keeps the Space alive and monitors health
"""
import os
import time
import requests
import sys
from datetime import datetime

# Monitor settings
# Base URL of the Space; taken from the SPACE_URL environment variable when set.
SPACE_URL = os.getenv('SPACE_URL', 'http://localhost:7860')
# Seconds between health checks (3 minutes).
CHECK_INTERVAL = 3 * 60
# NOTE(review): not referenced by the code visible in this file - confirm
# whether the health check was meant to use it.
HEALTH_ENDPOINT = '/health'
# Consecutive failed checks after which the Space is reported as down.
MAX_FAILURES = 3

def check_space_health():
    """Report whether a GET on SPACE_URL answers with HTTP 200.

    The request is given a 10 second timeout. Any exception raised while
    making the request is printed and treated as an unhealthy result.

    Returns:
        True when the response status code is exactly 200, otherwise False.
    """
    try:
        reply = requests.get(SPACE_URL, timeout=10)
        is_ok = reply.status_code == 200
        return is_ok
    except Exception as err:
        # Best-effort probe: report the problem and signal "not healthy".
        print(f"Health check failed: {err}")
        return False

def keep_space_warm():
    """POST a fixed dummy prediction to the Space to keep it warm.

    The request goes to ``{SPACE_URL}/api/predict`` with a 30 second timeout.
    Any exception raised along the way is printed and reported as failure.

    Returns:
        True when the response status code is 200 or 202, otherwise False.
    """
    accepted_statuses = (200, 202)
    try:
        # NOTE(review): the first entry looks like a protein sequence and the
        # remaining values presumably mirror the positional inputs of the
        # Space's prediction function - confirm against the Space's API.
        dummy_inputs = [
            "MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK",
            1,
            10,
            "Small",
            False,
            10,
        ]
        body = {"fn_index": 0, "data": dummy_inputs}
        reply = requests.post(f"{SPACE_URL}/api/predict", json=body, timeout=30)
        return reply.status_code in accepted_statuses
    except Exception as err:
        # Best-effort ping: report the problem and signal failure.
        print(f"Keep-warm request failed: {err}")
        return False

def monitor_loop():
    """Run the health-check / keep-warm cycle until interrupted.

    Each iteration calls check_space_health(). On success the failure
    counter is reset and, once at least CHECK_INTERVAL seconds have passed
    since the last successful keep-warm (or since start-up), a keep-warm
    request is sent via keep_space_warm(). On failure the consecutive
    failure counter is incremented, and once it reaches MAX_FAILURES an
    error message asking for a manual restart is printed on every check.

    The loop sleeps CHECK_INTERVAL seconds between iterations. An unexpected
    exception inside an iteration is printed and followed by a 60 second
    pause before retrying. Ctrl-C (KeyboardInterrupt) ends the loop cleanly
    at any point, including during either sleep.

    Returns:
        None. Only returns after a KeyboardInterrupt.
    """
    print("Starting Zero GPU Space monitor...")
    print(f"Space URL: {SPACE_URL}")
    print(f"Check interval: {CHECK_INTERVAL} seconds")

    consecutive_failures = 0
    # Monotonic clock: elapsed-time math must not be affected by wall-clock
    # adjustments (NTP corrections, DST changes).
    last_warm_up = time.monotonic()

    # The KeyboardInterrupt handler wraps the whole loop so that Ctrl-C is
    # also handled while sleeping in the error-retry path below.
    try:
        while True:
            try:
                iteration_start = time.monotonic()
                # Wall-clock time is used for the log timestamp only.
                print(f"\n[{datetime.now()}] Performing health check...")

                # Check if Space is healthy
                if check_space_health():
                    print("✓ Space is healthy")
                    consecutive_failures = 0

                    # Send a keep-warm request once a full interval has
                    # elapsed since the last successful one.
                    time_since_warmup = iteration_start - last_warm_up
                    if time_since_warmup >= CHECK_INTERVAL:
                        print("Sending keep-warm request...")
                        if keep_space_warm():
                            print("✓ Keep-warm successful")
                            last_warm_up = iteration_start
                        else:
                            # last_warm_up is left untouched so the next
                            # healthy check retries the keep-warm.
                            print("✗ Keep-warm failed")
                else:
                    consecutive_failures += 1
                    print(f"✗ Space is not responding (failure {consecutive_failures}/{MAX_FAILURES})")

                    if consecutive_failures >= MAX_FAILURES:
                        print("ERROR: Space appears to be down!")
                        print("Please restart the Space from Hugging Face interface")
                        # Could add notification logic here

                # Wait before next check
                time.sleep(CHECK_INTERVAL)

            except Exception as e:
                # Top-level boundary: keep the monitor alive on unexpected
                # errors, but back off before trying again.
                print(f"Monitor error: {e}")
                time.sleep(60)  # Wait a minute before retrying
    except KeyboardInterrupt:
        print("\nMonitor stopped by user")

if __name__ == "__main__":
    # A first command-line argument, when given, replaces the SPACE_URL
    # value that was resolved from the environment at import time.
    if sys.argv[1:]:
        SPACE_URL = sys.argv[1]

    monitor_loop()