da03 committed on
Commit
c3d464b
·
1 Parent(s): fe31dcf
Files changed (1) hide show
  1. start_system.sh +147 -0
start_system.sh ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/bin/bash

# Multi-GPU Neural OS Startup Script
#
# Usage: start_system.sh [--num-gpus N] [--port PORT]

# Default values
NUM_GPUS=2
DISPATCHER_PORT=8000

#######################################
# Parse command line arguments.
# Globals:   NUM_GPUS (written), DISPATCHER_PORT (written)
# Arguments: the script's command line ("$@")
# Outputs:   help text on stdout for -h/--help; error messages on stderr
# Returns:   0 on success; exits 0 after printing help; exits 1 on an
#            unknown option, a missing value or a non-numeric value
#######################################
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --num-gpus|--port)
                # `shift 2` fails *without shifting* when the value is
                # missing, which would make this loop spin forever.
                if [[ $# -lt 2 ]]; then
                    echo "Error: $1 requires a value" >&2
                    exit 1
                fi
                # Both values are used in arithmetic later on, so only
                # plain positive decimal integers are accepted.
                if [[ ! "$2" =~ ^[1-9][0-9]*$ ]]; then
                    echo "Error: $1 expects a positive integer, got '$2'" >&2
                    exit 1
                fi
                if [[ "$1" == "--num-gpus" ]]; then
                    NUM_GPUS="$2"
                else
                    # Length is checked first so the comparison cannot overflow.
                    if (( ${#2} > 5 || $2 > 65535 )); then
                        echo "Error: --port must be between 1 and 65535, got '$2'" >&2
                        exit 1
                    fi
                    DISPATCHER_PORT="$2"
                fi
                shift 2
                ;;
            -h|--help)
                echo "Usage: $0 [--num-gpus N] [--port PORT]"
                echo " --num-gpus N Number of GPU workers to start (default: 2)"
                echo " --port PORT Dispatcher port (default: 8000)"
                exit 0
                ;;
            *)
                echo "Unknown option: $1" >&2
                exit 1
                ;;
        esac
    done
}

parse_args "$@"

#######################################
# Function to cleanup background processes.
# Sends the default kill signal to the dispatcher and to the worker
# launcher (when their PIDs are recorded), waits for each of them, then
# terminates the script.
# Globals:   DISPATCHER_PID (read), WORKERS_PID (read)
# Arguments: $1 - exit status to terminate with (optional, default 0)
# Outputs:   progress messages on stdout
# Returns:   never returns; exits with the requested status
#######################################
cleanup() {
    local status="${1:-0}"

    echo ""
    echo "🛑 Shutting down system..."

    # Kill dispatcher
    if [[ -n "${DISPATCHER_PID:-}" ]]; then
        echo "Stopping dispatcher (PID: $DISPATCHER_PID)..."
        # Errors are silenced on purpose: the process may already be gone.
        kill "$DISPATCHER_PID" 2>/dev/null
        wait "$DISPATCHER_PID" 2>/dev/null
    fi

    # Kill workers
    if [[ -n "${WORKERS_PID:-}" ]]; then
        echo "Stopping workers (PID: $WORKERS_PID)..."
        kill "$WORKERS_PID" 2>/dev/null
        wait "$WORKERS_PID" 2>/dev/null
    fi

    echo "✅ System stopped"
    exit "$status"
}

# Start from empty PIDs so that a signal arriving before the processes are
# launched can never make cleanup act on values inherited from the environment.
DISPATCHER_PID=
WORKERS_PID=

# Set up signal handlers
trap cleanup SIGINT SIGTERM

#######################################
# Stop everything after a failure and make the script report it.
# cleanup terminates the script through its own exit call, so a plain
# `cleanup; exit 1` never reaches the `exit 1`. The EXIT trap installed
# here forces the final status to 1 whatever status that call uses.
#######################################
shutdown_on_failure() {
    trap 'exit 1' EXIT
    cleanup
    exit 1 # only reached if cleanup returns
}

echo "🚀 Starting Multi-GPU Neural OS System"
echo "========================================"
echo "📊 Number of GPUs: $NUM_GPUS"
echo "🌐 Dispatcher port: $DISPATCHER_PORT"
echo "💻 Worker ports: $(seq -s', ' 8001 $((8000 + NUM_GPUS)))"
echo ""

# Check if required files exist (paths are relative to the current directory)
for required_file in dispatcher.py worker.py start_workers.py; do
    if [[ ! -f "$required_file" ]]; then
        echo "❌ Error: $required_file not found" >&2
        exit 1
    fi
done

# Start dispatcher
echo "🎯 Starting dispatcher..."
python dispatcher.py --port "$DISPATCHER_PORT" > dispatcher.log 2>&1 &
DISPATCHER_PID=$!

# Wait a bit for dispatcher to start
sleep 3

# Check if dispatcher started successfully (kill -0 only probes the PID)
if ! kill -0 "$DISPATCHER_PID" 2>/dev/null; then
    echo "❌ Failed to start dispatcher. Check dispatcher.log for errors." >&2
    exit 1
fi

echo "✅ Dispatcher started (PID: $DISPATCHER_PID)"

# Start workers
echo "🔧 Starting $NUM_GPUS GPU workers..."
python start_workers.py --num-gpus "$NUM_GPUS" --no-monitor > workers.log 2>&1 &
WORKERS_PID=$!

# Wait a bit for workers to start
sleep 5

# Check if workers started successfully; the dispatcher is already running
# at this point, so it has to be stopped on failure.
if ! kill -0 "$WORKERS_PID" 2>/dev/null; then
    echo "❌ Failed to start workers. Check workers.log for errors." >&2
    shutdown_on_failure
fi

echo "✅ Workers started (PID: $WORKERS_PID)"
echo ""
echo "🎉 System is ready!"
echo "================================"
echo "🌍 Web interface: http://localhost:$DISPATCHER_PORT"
echo "📊 Dispatcher health: http://localhost:$DISPATCHER_PORT"
echo "🔧 Worker health checks:"
for ((i = 0; i < NUM_GPUS; i++)); do
    echo " GPU $i: http://localhost:$((8001 + i))/health"
done
echo ""
echo "📋 Log files:"
echo " Dispatcher: dispatcher.log"
echo " Workers: workers.log"
echo ""
echo "Press Ctrl+C to stop the system"
echo "================================"

# Keep the script running and wait for interrupt
while true; do
    # Check if processes are still running
    if ! kill -0 "$DISPATCHER_PID" 2>/dev/null; then
        echo "⚠️ Dispatcher process died unexpectedly" >&2
        shutdown_on_failure
    fi

    if ! kill -0 "$WORKERS_PID" 2>/dev/null; then
        echo "⚠️ Workers process died unexpectedly" >&2
        shutdown_on_failure
    fi

    sleep 5
done