Spaces:
Runtime error
Runtime error
File size: 5,270 Bytes
c3d464b c74f490 c3d464b 8b76adf c3d464b c74f490 c3d464b c74f490 c3d464b c74f490 c3d464b c74f490 c3d464b fa82766 2ff6d31 fa82766 2ff6d31 c3d464b fa82766 c3d464b c74f490 c3d464b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 |
#!/bin/bash
# Multi-GPU Neural OS Startup Script
# Default values
NUM_GPUS=2
DISPATCHER_PORT=8000

#######################################
# Parse command line arguments into NUM_GPUS and DISPATCHER_PORT.
# Globals:   NUM_GPUS (written), DISPATCHER_PORT (written)
# Arguments: the script's command line words ("$@")
# Outputs:   usage text on stdout for -h/--help; error messages on stderr
# Returns:   0 when all arguments were consumed. Exits 0 after printing
#            help; exits 1 on an unknown option, a missing value, or a
#            value that is not a valid positive integer / port number.
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --num-gpus | --port)
        # `shift 2` with only one word left fails WITHOUT shifting, so an
        # option given last without its value would loop here forever.
        if [[ $# -lt 2 ]]; then
          echo "Error: $1 requires a value" >&2
          exit 1
        fi
        # NUM_GPUS is used in arithmetic expansions and numeric tests later
        # in the script, so only plain decimal integers >= 1 are accepted
        # (a leading zero would be read as octal by the shell).
        if [[ ! "$2" =~ ^[1-9][0-9]*$ ]]; then
          echo "Error: $1 expects a positive integer, got '$2'" >&2
          exit 1
        fi
        if [[ "$1" == "--port" ]]; then
          # Length check first so an absurdly long digit string never
          # reaches the arithmetic comparison.
          if [[ ${#2} -gt 5 ]] || (($2 > 65535)); then
            echo "Error: --port must be between 1 and 65535, got '$2'" >&2
            exit 1
          fi
          DISPATCHER_PORT="$2"
        else
          NUM_GPUS="$2"
        fi
        shift 2
        ;;
      -h | --help)
        echo "Usage: $0 [--num-gpus N] [--port PORT]"
        echo " --num-gpus N Number of GPU workers to start (default: 2)"
        echo " --port PORT Dispatcher port (default: 8000)"
        exit 0
        ;;
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac
  done
}

# Parse command line arguments
parse_args "$@"
#######################################
# Stop the dispatcher and every GPU worker, then terminate the script.
# Globals:   DISPATCHER_PID (read) - PID of the background dispatcher; may
#            be unset or empty if the dispatcher was never started
# Arguments: $1 - exit status to terminate with (optional, default 1).
#            The direct call sites in this script are all error paths that
#            follow the call with `exit 1`; because this function exits
#            itself, that `exit 1` is never reached, so the failure status
#            has to be the default here. The signal trap passes 0.
# Outputs:   progress messages on stdout
# Returns:   never returns; always exits with the chosen status
#######################################
cleanup() {
  local status="${1:-1}"
  echo ""
  echo "π Shutting down system..."
  # Kill dispatcher
  if [[ -n "${DISPATCHER_PID:-}" ]]; then
    echo "Stopping dispatcher (PID: $DISPATCHER_PID)..."
    kill "$DISPATCHER_PID" 2>/dev/null
    # Reap the background job; errors (already gone) are expected and ignored
    wait "$DISPATCHER_PID" 2>/dev/null
  fi
  # Kill workers by finding their processes
  echo "Stopping workers..."
  # pkill exits non-zero when nothing matched; that is not an error here
  pkill -f "python.*worker.py.*--gpu-id" 2>/dev/null || true
  # Give the workers a moment to act on SIGTERM
  sleep 2
  # Force kill if any are still running
  pkill -9 -f "python.*worker.py.*--gpu-id" 2>/dev/null || true
  echo "β
System stopped"
  exit "$status"
}
# Set up signal handlers: an operator-requested stop (Ctrl+C / SIGTERM) is a
# normal shutdown and exits 0.
trap 'cleanup 0' SIGINT SIGTERM
# Startup banner: summarise the configuration before anything is launched.
printf '%s\n' \
  "π Starting Multi-GPU Neural OS System" \
  "========================================" \
  "π Number of GPUs: $NUM_GPUS" \
  "π Dispatcher port: $DISPATCHER_PORT"
# Worker ports are listed as 8001 .. 8000+NUM_GPUS, comma separated.
banner_worker_ports=$(seq -s', ' 8001 $((8000 + NUM_GPUS)))
# The analytics log name shown is built from the current timestamp.
banner_timestamp=$(date +%Y%m%d_%H%M%S)
printf '%s\n' \
  "π» Worker ports: ${banner_worker_ports}" \
  "π Analytics logging: system_analytics_${banner_timestamp}.log" \
  ""
# Verify that every script this launcher depends on exists in the current
# working directory; report the first missing one and abort with status 1.
for required_script in dispatcher.py worker.py start_workers.py; do
  [[ -f "$required_script" ]] && continue
  echo "β Error: $required_script not found"
  exit 1
done
# Start dispatcher
echo "π― Starting dispatcher..."
# Run the dispatcher in the background with stdout and stderr captured in
# dispatcher.log. The port is quoted so it is always passed as exactly one
# argument, even if empty or containing whitespace.
python dispatcher.py --port "$DISPATCHER_PORT" > dispatcher.log 2>&1 &
# Remember the background job's PID for the liveness checks and for cleanup
DISPATCHER_PID=$!
# Wait a bit for dispatcher to start
sleep 3
# Check if dispatcher started successfully: `kill -0` delivers no signal, it
# only reports whether the process still exists.
if ! kill -0 "$DISPATCHER_PID" 2>/dev/null; then
  echo "β Failed to start dispatcher. Check dispatcher.log for errors."
  exit 1
fi
echo "β
Dispatcher started (PID: $DISPATCHER_PID)"
# Start workers
echo "π§ Starting $NUM_GPUS GPU workers..."
# The launcher runs in the foreground; its output goes to workers.log
python start_workers.py --num-gpus "$NUM_GPUS" --no-monitor > workers.log 2>&1
WORKER_START_EXIT_CODE=$?
# A non-zero launcher status is already conclusive, so report it right away
# instead of waiting first.
if [[ "$WORKER_START_EXIT_CODE" -ne 0 ]]; then
  echo "β Failed to start workers. Check workers.log for errors."
  cleanup
  exit 1
fi
# Wait a bit for workers to register
sleep 3
# Check if workers are actually running by looking for their processes.
# pgrep never reports itself, whereas `ps aux | grep -c PATTERN` also counts
# the grep process (its own command line contains the pattern), which
# over-reports by one. `wc -l` always prints exactly one number, and the
# arithmetic expansion strips any padding some `wc` implementations add.
RUNNING_WORKERS=$(($(pgrep -f "python.*worker.py.*--gpu-id" | wc -l)))
if [[ "$RUNNING_WORKERS" -lt "$NUM_GPUS" ]]; then
  echo "β Not all workers are running. Expected $NUM_GPUS, found $RUNNING_WORKERS. Check workers.log for errors."
  cleanup
  exit 1
fi
echo "β
Workers started successfully ($RUNNING_WORKERS workers running)"
# Final status report: service URLs, per-GPU health endpoints, log file
# names, and example monitoring / analysis commands.
printf '%s\n' \
  "" \
  "π System is ready!" \
  "================================" \
  "π Web interface: http://localhost:$DISPATCHER_PORT" \
  "π Dispatcher health: http://localhost:$DISPATCHER_PORT" \
  "π§ Worker health checks:"
# One health URL per worker; the port is 8001 plus the GPU index.
gpu_idx=0
while ((gpu_idx < NUM_GPUS)); do
  printf '%s\n' " GPU $gpu_idx: http://localhost:$((8001 + gpu_idx))/health"
  gpu_idx=$((gpu_idx + 1))
done
printf '%s\n' \
  "" \
  "π Log files:" \
  " π Analytics (human-readable): system_analytics_*.log" \
  " π₯οΈ GPU metrics (JSON): gpu_metrics_*.jsonl" \
  " π Connection events (JSON): connection_events_*.jsonl" \
  " π Queue metrics (JSON): queue_metrics_*.jsonl" \
  " π IP statistics (JSON): ip_stats_*.jsonl" \
  " π― Dispatcher: dispatcher.log" \
  " π§ Workers summary: workers.log"
# One log file name per worker, keyed by GPU index.
gpu_idx=0
while ((gpu_idx < NUM_GPUS)); do
  printf '%s\n' " π₯οΈ GPU $gpu_idx worker: worker_gpu_$gpu_idx.log"
  gpu_idx=$((gpu_idx + 1))
done
printf '%s\n' \
  "" \
  "π‘ Real-time monitoring:" \
  " Human-readable: tail -f system_analytics_*.log" \
  " GPU utilization: tail -f gpu_metrics_*.jsonl" \
  " Connection events: tail -f connection_events_*.jsonl" \
  "" \
  "π Data analysis:" \
  " Summary report: python analyze_analytics.py" \
  " Last 6 hours: python analyze_analytics.py --since 6" \
  " GPU analysis only: python analyze_analytics.py --type gpu" \
  "Press Ctrl+C to stop the system" \
  "================================"
# Keep the script running and wait for interrupt. Every 5 seconds the loop
# verifies that the dispatcher and the expected number of workers are still
# alive; if either check fails the system is shut down with a failure status.
while true; do
  # Check if processes are still running (`kill -0` only probes the PID).
  # Quoted so an empty PID cannot turn this into an argument-less `kill -0`.
  if ! kill -0 "$DISPATCHER_PID" 2>/dev/null; then
    echo "β οΈ Dispatcher process died unexpectedly"
    cleanup
    exit 1
  fi
  # Check if workers are still running. pgrep never reports itself, whereas
  # `ps aux | grep -c PATTERN` also counts the grep process and so could hide
  # one dead worker. `wc -l` always prints exactly one number, and the
  # arithmetic expansion strips any padding some `wc` implementations add.
  CURRENT_WORKERS=$(($(pgrep -f "python.*worker.py.*--gpu-id" | wc -l)))
  if [[ "$CURRENT_WORKERS" -lt "$NUM_GPUS" ]]; then
    echo "β οΈ Some workers died unexpectedly. Expected $NUM_GPUS, found $CURRENT_WORKERS"
    cleanup
    exit 1
  fi
  sleep 5
done