#!/bin/bash
# Multi-GPU Neural OS Startup Script
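#
# Starts one dispatcher plus one worker per GPU. Example invocations
# (the script filename below is an assumption; use whatever name this
# file is saved under -- the flags are the ones parsed further down):
#
#   ./start_multi_gpu.sh                          # auto-detect GPUs, port 7860
#   ./start_multi_gpu.sh --num-gpus 2 --port 7861
#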
# Function to detect number of GPUs automatically
detect_gpu_count() {
    if command -v nvidia-smi >/dev/null 2>&1; then
        # Use nvidia-smi to count GPUs
        local gpu_count=$(nvidia-smi -L 2>/dev/null | wc -l)
        if [ "$gpu_count" -gt 0 ]; then
            echo "$gpu_count"
            return 0
        fi
    fi

    # If nvidia-smi fails, try alternative methods
    if [ -d "/proc/driver/nvidia/gpus" ]; then
        local gpu_count=$(ls -d /proc/driver/nvidia/gpus/*/information 2>/dev/null | wc -l)
        if [ "$gpu_count" -gt 0 ]; then
            echo "$gpu_count"
            return 0
        fi
    fi

    # Default fallback
    echo "1"
    return 1
}
# Detect GPU count automatically
DETECTED_GPUS=$(detect_gpu_count)
GPU_DETECTION_SUCCESS=$?
# Default values
NUM_GPUS=$DETECTED_GPUS
DISPATCHER_PORT=7860
# Parse command line arguments
while [[ $# -gt 0 ]]; do
    case $1 in
        --num-gpus)
            NUM_GPUS="$2"
            shift 2
            ;;
        --port)
            DISPATCHER_PORT="$2"
            shift 2
            ;;
        -h|--help)
            echo "Usage: $0 [--num-gpus N] [--port PORT]"
            echo "  --num-gpus N   Number of GPU workers to start (default: auto-detected)"
            echo "  --port PORT    Dispatcher port (default: 7860)"
            echo ""
            echo "GPU Detection:"
            echo "  Automatically detects available GPUs using nvidia-smi"
            echo "  Currently detected: $DETECTED_GPUS GPU(s)"
            if [ $GPU_DETECTION_SUCCESS -ne 0 ]; then
                echo "  ⚠️  GPU detection failed - using fallback of 1 GPU"
            fi
            exit 0
            ;;
        *)
            echo "Unknown option: $1" >&2
            exit 1
            ;;
    esac
done
# Function to cleanup background processes
cleanup() {
    echo ""
    echo "🛑 Shutting down system..."

    # Kill dispatcher
    if [[ -n $DISPATCHER_PID ]]; then
        echo "Stopping dispatcher (PID: $DISPATCHER_PID)..."
        kill $DISPATCHER_PID 2>/dev/null
        wait $DISPATCHER_PID 2>/dev/null
    fi

    # Kill workers by finding their processes
    echo "Stopping workers..."
    pkill -f "python.*worker.py.*--worker-address" 2>/dev/null || true
    sleep 2

    # Force kill if any are still running
    pkill -9 -f "python.*worker.py.*--worker-address" 2>/dev/null || true

    echo "✅ System stopped"
    exit 0
}
# Set up signal handlers
trap cleanup SIGINT SIGTERM
echo "π Starting Multi-GPU Neural OS System"
echo "========================================"
echo "π GPU Detection: $DETECTED_GPUS GPU(s) detected"
if [ $GPU_DETECTION_SUCCESS -ne 0 ]; then
echo "β οΈ GPU detection failed - using fallback count"
elif command -v nvidia-smi >/dev/null 2>&1; then
echo "π Detected GPUs:"
nvidia-smi -L 2>/dev/null | sed 's/^/ /'
fi
echo "π Number of GPUs: $NUM_GPUS"
echo "π Dispatcher port: $DISPATCHER_PORT"
echo "π» Worker ports: $(seq -s', ' 8001 $((8000 + NUM_GPUS)))"
echo "π Analytics logging: system_analytics_$(date +%Y%m%d_%H%M%S).log"
echo ""
# Validate that we're not trying to start more workers than GPUs
if [ "$NUM_GPUS" -gt "$DETECTED_GPUS" ]; then
echo "β οΈ Warning: Trying to start $NUM_GPUS workers but only $DETECTED_GPUS GPU(s) detected"
echo " This may cause GPU sharing or errors. Consider using --num-gpus $DETECTED_GPUS"
echo ""
fi
# Check if required files exist
if [[ ! -f "dispatcher.py" ]]; then
echo "β Error: dispatcher.py not found"
exit 1
fi
if [[ ! -f "worker.py" ]]; then
echo "β Error: worker.py not found"
exit 1
fi
if [[ ! -f "start_workers.py" ]]; then
echo "β Error: start_workers.py not found"
exit 1
fi
# Start dispatcher
echo "π― Starting dispatcher..."
python dispatcher.py --port $DISPATCHER_PORT > dispatcher.log 2>&1 &
DISPATCHER_PID=$!
# Wait a bit for dispatcher to start
sleep 3
# Check if dispatcher started successfully
if ! kill -0 $DISPATCHER_PID 2>/dev/null; then
echo "β Failed to start dispatcher. Check dispatcher.log for errors."
exit 1
fi
echo "β
Dispatcher started (PID: $DISPATCHER_PID)"
# Start workers
echo "π§ Starting $NUM_GPUS GPU workers..."
python start_workers.py --num-gpus $NUM_GPUS --dispatcher-url "http://localhost:$DISPATCHER_PORT" --no-monitor > workers.log 2>&1
WORKER_START_EXIT_CODE=$?
# Wait for workers to fully load models and register (60 seconds)
echo "β³ Waiting 60 seconds for workers to load models and register..."
sleep 60
# Check if workers started successfully by checking the exit code and log
if [ $WORKER_START_EXIT_CODE -ne 0 ]; then
echo "β Failed to start workers. Check workers.log for errors."
cleanup
exit 1
fi
# Check if workers are actually running by looking for their processes (updated for new --worker-address format)
RUNNING_WORKERS=$(ps aux | grep -c "python.*worker.py.*--worker-address" || echo "0")
if [ "$RUNNING_WORKERS" -lt "$NUM_GPUS" ]; then
echo "β Not all workers are running. Expected $NUM_GPUS, found $RUNNING_WORKERS. Check workers.log for errors."
cleanup
exit 1
fi
echo "β
Workers started successfully ($RUNNING_WORKERS workers running)"
echo ""
echo "π System is ready!"
echo "================================"
echo "π Web interface: http://localhost:$DISPATCHER_PORT"
echo "π Dispatcher health: http://localhost:$DISPATCHER_PORT"
echo "π§ Worker health checks:"
for ((i=0; i<NUM_GPUS; i++)); do
echo " GPU $i: http://localhost:$((8001 + i))/health"
done
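# Each worker serves the /health endpoint printed above on ports 8001..8000+N.
# A quick manual check (assuming curl is available) looks like:
#   curl -s http://localhost:8001/health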
echo ""
echo "π Log files:"
echo " π Analytics (human-readable): system_analytics_*.log"
echo " π₯οΈ GPU metrics (JSON): gpu_metrics_*.jsonl"
echo " π Connection events (JSON): connection_events_*.jsonl"
echo " π Queue metrics (JSON): queue_metrics_*.jsonl"
echo " π IP statistics (JSON): ip_stats_*.jsonl"
echo " π― Dispatcher: dispatcher.log"
echo " π§ Workers summary: workers.log"
for ((i=0; i<NUM_GPUS; i++)); do
echo " π₯οΈ GPU $i worker: worker_gpu_$i.log"
done
echo ""
echo "π‘ Real-time monitoring:"
echo " Human-readable: tail -f system_analytics_*.log"
echo " GPU utilization: tail -f gpu_metrics_*.jsonl"
echo " Connection events: tail -f connection_events_*.jsonl"
echo ""
echo "π Data analysis:"
echo " Summary report: python analyze_analytics.py"
echo " Last 6 hours: python analyze_analytics.py --since 6"
echo " GPU analysis only: python analyze_analytics.py --type gpu"
echo "Press Ctrl+C to stop the system"
echo "================================"
# Keep the script running and wait for interrupt
while true; do
    # Check if the dispatcher is still running
    if ! kill -0 $DISPATCHER_PID 2>/dev/null; then
        echo "⚠️  Dispatcher process died unexpectedly"
        cleanup
        exit 1
    fi

    # Check if workers are still running ("[p]ython" keeps grep from counting itself)
    CURRENT_WORKERS=$(ps aux | grep -c "[p]ython.*worker.py.*--worker-address")
    if [ "$CURRENT_WORKERS" -lt "$NUM_GPUS" ]; then
        echo "⚠️  Some workers died unexpectedly. Expected $NUM_GPUS, found $CURRENT_WORKERS"
        echo "🔄 System will continue operating with reduced capacity"
        echo "💡 Check worker logs for error details"
        # Don't exit - keep system running with remaining workers
    fi

    sleep 5
done