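# Multi-GPU Neural OS Startup Script
#
# NOTE: this copy was captured from a hosting page whose status banner read
# "Spaces: Runtime error"; that banner is page chrome, not part of the script.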
#######################################
# Report how many NVIDIA GPUs are visible on this host.
#
# Detection order:
#   1. `nvidia-smi -L`, counting only lines that start with "GPU ".
#   2. Per-GPU "information" files under /proc/driver/nvidia/gpus.
#   3. A fallback value of 1 when neither source reports a GPU.
#
# Outputs: the GPU count on stdout.
# Returns: 0 when a count was detected, 1 when the fallback was used.
#######################################
detect_gpu_count() {
  local gpu_count=0
  local info_file

  if command -v nvidia-smi >/dev/null 2>&1; then
    # Count device lines only. Counting every output line would treat any
    # other stdout line (a diagnostic message, an indented sub-device entry)
    # as a GPU. `grep -c` still prints 0 when nothing matches.
    gpu_count=$(nvidia-smi -L 2>/dev/null | grep -c '^GPU ')
    if [[ "$gpu_count" =~ ^[0-9]+$ ]] && (( gpu_count > 0 )); then
      printf '%s\n' "$gpu_count"
      return 0
    fi
  fi

  # If nvidia-smi is missing or reported nothing, count the driver's
  # per-GPU entries instead.
  gpu_count=0
  for info_file in /proc/driver/nvidia/gpus/*/information; do
    # An unmatched glob is left as the literal pattern, so only count
    # paths that really exist.
    if [[ -e "$info_file" ]]; then
      gpu_count=$((gpu_count + 1))
    fi
  done
  if (( gpu_count > 0 )); then
    printf '%s\n' "$gpu_count"
    return 0
  fi

  # Default fallback
  printf '1\n'
  return 1
}
# Detect GPU count automatically. The function's exit status tells us whether
# the number is a real detection (0) or the fallback value (non-zero).
DETECTED_GPUS=$(detect_gpu_count)
GPU_DETECTION_SUCCESS=$?

# Default values
NUM_GPUS=$DETECTED_GPUS
DISPATCHER_PORT=7860

# Parse command line arguments
while [[ $# -gt 0 ]]; do
  case "$1" in
    --num-gpus)
      # Without this guard a trailing "--num-gpus" makes `shift 2` fail
      # without shifting anything, and the loop would never terminate.
      if [[ $# -lt 2 ]]; then
        echo "Option $1 requires a value" >&2
        exit 1
      fi
      NUM_GPUS="$2"
      shift 2
      ;;
    --port)
      if [[ $# -lt 2 ]]; then
        echo "Option $1 requires a value" >&2
        exit 1
      fi
      DISPATCHER_PORT="$2"
      shift 2
      ;;
    -h|--help)
      echo "Usage: $0 [--num-gpus N] [--port PORT]"
      echo " --num-gpus N Number of GPU workers to start (default: auto-detected)"
      echo " --port PORT Dispatcher port (default: 7860)"
      echo ""
      echo "GPU Detection:"
      echo " Automatically detects available GPUs using nvidia-smi"
      echo " Currently detected: $DETECTED_GPUS GPU(s)"
      if [[ "$GPU_DETECTION_SUCCESS" -ne 0 ]]; then
        echo " ⚠️ GPU detection failed - using fallback of 1 GPU"
      fi
      exit 0
      ;;
    *)
      echo "Unknown option: $1" >&2
      exit 1
      ;;
  esac
done

# NUM_GPUS is later used inside arithmetic expansion and numeric tests, where
# a non-numeric string is either an error or evaluated as an expression, so
# reject anything that is not a plain non-negative integer up front.
if ! [[ "$NUM_GPUS" =~ ^(0|[1-9][0-9]*)$ ]]; then
  echo "Invalid --num-gpus value: '$NUM_GPUS' (expected a non-negative integer)" >&2
  exit 1
fi

# 10# forces base 10 so that a value such as "08" is not parsed as octal.
if ! [[ "$DISPATCHER_PORT" =~ ^[0-9]{1,5}$ ]] \
  || (( 10#$DISPATCHER_PORT < 1 || 10#$DISPATCHER_PORT > 65535 )); then
  echo "Invalid --port value: '$DISPATCHER_PORT' (expected 1-65535)" >&2
  exit 1
fi
#######################################
# Stop the dispatcher and every worker process.
#
# Globals:  DISPATCHER_PID (read) - PID of the background dispatcher, if any.
# Outputs:  progress messages on stdout.
# Returns:  0. The function deliberately does NOT call `exit`: callers on
#           failure paths run `cleanup` followed by `exit 1`, and an `exit 0`
#           in here would make those failures report success.
#######################################
cleanup() {
  echo ""
  echo "🛑 Shutting down system..."

  # Kill dispatcher
  if [[ -n "${DISPATCHER_PID:-}" ]]; then
    echo "Stopping dispatcher (PID: $DISPATCHER_PID)..."
    kill "$DISPATCHER_PID" 2>/dev/null
    # Reap the child so it does not linger; errors are ignored because the
    # process may already be gone.
    wait "$DISPATCHER_PID" 2>/dev/null
  fi

  # Kill workers by finding their processes
  echo "Stopping workers..."
  pkill -f "python.*worker.py.*--worker-address" 2>/dev/null || true
  # Give workers a moment to exit on SIGTERM before escalating.
  sleep 2
  # Force kill if any are still running
  pkill -9 -f "python.*worker.py.*--worker-address" 2>/dev/null || true

  echo "✅ System stopped"
  return 0
}

# Set up signal handlers. A signal-initiated shutdown still ends the script
# with status 0, as before; the exit now lives in the trap, not in cleanup.
trap 'cleanup; exit 0' SIGINT SIGTERM
# Startup banner: report what was detected and what will be launched.
echo "🚀 Starting Multi-GPU Neural OS System"
echo "========================================"
echo "🔍 GPU Detection: $DETECTED_GPUS GPU(s) detected"
if [[ "$GPU_DETECTION_SUCCESS" -ne 0 ]]; then
  echo "⚠️ GPU detection failed - using fallback count"
elif command -v nvidia-smi >/dev/null 2>&1; then
  echo "📋 Detected GPUs:"
  nvidia-smi -L 2>/dev/null | sed 's/^/ /'
fi

# Workers listen on consecutive ports starting at 8001. The list is built
# with a shell loop because `seq -s` output differs between GNU and BSD seq.
WORKER_PORTS=""
for ((i = 1; i <= NUM_GPUS; i++)); do
  WORKER_PORTS+="${WORKER_PORTS:+, }$((8000 + i))"
done

echo "📊 Number of GPUs: $NUM_GPUS"
echo "🌐 Dispatcher port: $DISPATCHER_PORT"
echo "💻 Worker ports: $WORKER_PORTS"
# NOTE(review): this timestamp is generated here at print time; confirm that
# it matches the file name the analytics logger actually creates.
echo "📈 Analytics logging: system_analytics_$(date +%Y%m%d_%H%M%S).log"
echo ""

# Validate that we're not trying to start more workers than GPUs
if [[ "$NUM_GPUS" -gt "$DETECTED_GPUS" ]]; then
  echo "⚠️ Warning: Trying to start $NUM_GPUS workers but only $DETECTED_GPUS GPU(s) detected"
  echo " This may cause GPU sharing or errors. Consider using --num-gpus $DETECTED_GPUS"
  echo ""
fi

# Check if required files exist; diagnostics go to stderr so they are not
# lost when stdout is redirected.
for required_file in dispatcher.py worker.py start_workers.py; do
  if [[ ! -f "$required_file" ]]; then
    echo "❌ Error: $required_file not found" >&2
    exit 1
  fi
done
# Start dispatcher in the background, logging to dispatcher.log.
echo "🎯 Starting dispatcher..."
python dispatcher.py --port "$DISPATCHER_PORT" > dispatcher.log 2>&1 &
DISPATCHER_PID=$!

# Wait a bit for dispatcher to start
sleep 3

# Check if dispatcher started successfully (kill -0 only probes the PID).
if ! kill -0 "$DISPATCHER_PID" 2>/dev/null; then
  echo "❌ Failed to start dispatcher. Check dispatcher.log for errors." >&2
  exit 1
fi
echo "✅ Dispatcher started (PID: $DISPATCHER_PID)"

# Start workers. This call runs in the foreground; its output goes to
# workers.log.
echo "🔧 Starting $NUM_GPUS GPU workers..."
python start_workers.py --num-gpus "$NUM_GPUS" --dispatcher-url "http://localhost:$DISPATCHER_PORT" --no-monitor > workers.log 2>&1
WORKER_START_EXIT_CODE=$?

# Fail fast: the launcher's exit code is already known, so there is no point
# in waiting a minute before reporting a failed launch.
if [[ "$WORKER_START_EXIT_CODE" -ne 0 ]]; then
  echo "❌ Failed to start workers. Check workers.log for errors." >&2
  cleanup
  exit 1
fi

# Wait for workers to fully load models and register (60 seconds)
echo "⏳ Waiting 60 seconds for workers to load models and register..."
sleep 60

# Check if workers are actually running by looking for their processes.
# pgrep never reports itself, whereas `ps aux | grep -c PATTERN` can count the
# grep process (its command line contains the pattern text), and the former
# `|| echo "0"` produced a two-line value when grep -c had already printed 0.
RUNNING_WORKERS=$(pgrep -f "python.*worker.py.*--worker-address" | wc -l)
RUNNING_WORKERS=${RUNNING_WORKERS//[[:space:]]/} # some wc builds pad the number
if [[ "$RUNNING_WORKERS" -lt "$NUM_GPUS" ]]; then
  echo "❌ Not all workers are running. Expected $NUM_GPUS, found $RUNNING_WORKERS. Check workers.log for errors." >&2
  cleanup
  exit 1
fi
echo "✅ Workers started successfully ($RUNNING_WORKERS workers running)"
# Summary shown once everything is up: endpoints, log locations and handy
# monitoring / analysis commands. Purely informational output.
echo ""
echo "🎉 System is ready!"
echo "================================"
echo "🌍 Web interface: http://localhost:$DISPATCHER_PORT"
echo "📊 Dispatcher health: http://localhost:$DISPATCHER_PORT"
echo "🔧 Worker health checks:"
# Worker i is reported on port 8001 + i.
for ((i = 0; i < NUM_GPUS; i++)); do
  echo " GPU $i: http://localhost:$((8001 + i))/health"
done
echo ""

echo "📋 Log files:"
printf '%s\n' \
  " 📄 Analytics (human-readable): system_analytics_*.log" \
  " 🖥️ GPU metrics (JSON): gpu_metrics_*.jsonl" \
  " 🔗 Connection events (JSON): connection_events_*.jsonl" \
  " 📝 Queue metrics (JSON): queue_metrics_*.jsonl" \
  " 🌍 IP statistics (JSON): ip_stats_*.jsonl" \
  " 🎯 Dispatcher: dispatcher.log" \
  " 🔧 Workers summary: workers.log"
for ((i = 0; i < NUM_GPUS; i++)); do
  echo " 🖥️ GPU $i worker: worker_gpu_$i.log"
done
echo ""

echo "💡 Real-time monitoring:"
printf '%s\n' \
  " Human-readable: tail -f system_analytics_*.log" \
  " GPU utilization: tail -f gpu_metrics_*.jsonl" \
  " Connection events: tail -f connection_events_*.jsonl"
echo ""

echo "📈 Data analysis:"
printf '%s\n' \
  " Summary report: python analyze_analytics.py" \
  " Last 6 hours: python analyze_analytics.py --since 6" \
  " GPU analysis only: python analyze_analytics.py --type gpu"
echo "Press Ctrl+C to stop the system"
echo "================================"
# Keep the script running and wait for interrupt. Every 5 seconds the loop
# verifies the dispatcher is alive and compares the worker count with the
# requested number.
while true; do
  # Check if the dispatcher is still running; without it the system is down.
  if ! kill -0 "$DISPATCHER_PID" 2>/dev/null; then
    echo "⚠️ Dispatcher process died unexpectedly"
    cleanup
    exit 1
  fi

  # Check if workers are still running. pgrep excludes itself, unlike
  # `ps aux | grep -c PATTERN`, which can count the grep process and hide one
  # dead worker; the old `|| echo "0"` also yielded a two-line value when
  # nothing matched.
  CURRENT_WORKERS=$(pgrep -f "python.*worker.py.*--worker-address" | wc -l)
  CURRENT_WORKERS=${CURRENT_WORKERS//[[:space:]]/} # some wc builds pad the number
  if [[ "$CURRENT_WORKERS" -lt "$NUM_GPUS" ]]; then
    echo "⚠️ Some workers died unexpectedly. Expected $NUM_GPUS, found $CURRENT_WORKERS"
    echo "🔄 System will continue operating with reduced capacity"
    echo "💡 Check worker logs for error details"
    # Don't exit - keep system running with remaining workers
  fi

  sleep 5
done