neural-os / start_system.sh
da03
.
d4eaeb4
raw
history blame
5.61 kB
#!/bin/bash
# Multi-GPU Neural OS Startup Script
#
# Usage: see usage() below, or run with --help.

# Default values
NUM_GPUS=2
DISPATCHER_PORT=8000

# Print the command-line help text to stdout.
usage() {
  echo "Usage: $0 [--num-gpus N] [--port PORT]"
  echo " --num-gpus N Number of GPU workers to start (default: 2)"
  echo " --port PORT Dispatcher port (default: 8000)"
}

#######################################
# Parse command-line options into NUM_GPUS and DISPATCHER_PORT.
# Globals:   NUM_GPUS (written), DISPATCHER_PORT (written)
# Arguments: the script's command-line arguments ("$@")
# Outputs:   help text on stdout for -h/--help; error messages on stderr
# Exits:     0 after printing help; 1 on an unknown option or on a
#            missing/invalid option value
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --num-gpus)
        # The $# check matters: `shift 2` with only one argument left fails
        # WITHOUT shifting, which would make this loop spin forever.
        if [[ $# -lt 2 || ! "$2" =~ ^[0-9]{1,5}$ ]]; then
          echo "Error: --num-gpus requires a non-negative integer value" >&2
          exit 1
        fi
        # Force base 10 so a value such as "08" is not treated as invalid
        # octal by later arithmetic on NUM_GPUS.
        NUM_GPUS=$((10#$2))
        shift 2
        ;;
      --port)
        if [[ $# -lt 2 || ! "$2" =~ ^[0-9]{1,5}$ ]] \
          || ((10#$2 < 1 || 10#$2 > 65535)); then
          echo "Error: --port requires a port number between 1 and 65535" >&2
          exit 1
        fi
        DISPATCHER_PORT=$((10#$2))
        shift 2
        ;;
      -h|--help)
        usage
        exit 0
        ;;
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac
  done
}

# Parse command line arguments
parse_args "$@"
#######################################
# Stop the dispatcher and all worker processes, then exit the script.
# Installed as the SIGINT/SIGTERM handler and also called on fatal errors.
# Globals:   DISPATCHER_PID (read; may be unset if no dispatcher was started)
# Arguments: none
# Outputs:   progress messages on stdout
# Returns:   never returns; always ends the shell with exit status 0
#######################################
cleanup() {
  # This script holds no worker PIDs, so workers are matched by command line.
  local worker_pattern="python.*worker.py.*--worker-address"
  local waited=0

  echo ""
  echo "🛑 Shutting down system..."

  # Kill dispatcher
  if [[ -n "${DISPATCHER_PID:-}" ]]; then
    echo "Stopping dispatcher (PID: $DISPATCHER_PID)..."
    kill "$DISPATCHER_PID" 2>/dev/null
    wait "$DISPATCHER_PID" 2>/dev/null
  fi

  # Kill workers by finding their processes. Errors are suppressed on purpose:
  # "no process matched" is a normal outcome here.
  echo "Stopping workers..."
  pkill -f "$worker_pattern" 2>/dev/null || true

  # Give workers up to 2 seconds to exit, but stop waiting as soon as none
  # are left instead of always sleeping for the full grace period.
  while [[ "$waited" -lt 2 ]] && pgrep -f "$worker_pattern" >/dev/null 2>&1; do
    sleep 1
    waited=$((waited + 1))
  done

  # Force kill if any are still running
  pkill -9 -f "$worker_pattern" 2>/dev/null || true

  echo "✅ System stopped"
  exit 0
}
# Set up signal handlers: Ctrl+C (SIGINT) and SIGTERM both run cleanup.
trap cleanup SIGINT SIGTERM

# Build the "8001, 8002, ..." list with shell arithmetic instead of
# `seq -s`, whose separator handling differs between GNU and BSD seq.
WORKER_PORT_LIST=""
for ((i = 1; i <= NUM_GPUS; i++)); do
  WORKER_PORT_LIST+="${WORKER_PORT_LIST:+, }$((8000 + i))"
done

echo "🚀 Starting Multi-GPU Neural OS System"
echo "========================================"
echo "📊 Number of GPUs: $NUM_GPUS"
echo "🌐 Dispatcher port: $DISPATCHER_PORT"
echo "💻 Worker ports: $WORKER_PORT_LIST"
# NOTE(review): this timestamp is computed here, at banner time; whether it
# matches the file name the analytics writer actually uses is not visible in
# this script — confirm.
echo "📈 Analytics logging: system_analytics_$(date +%Y%m%d_%H%M%S).log"
echo ""

# Check if required files exist (paths are relative to the current directory).
for required_file in dispatcher.py worker.py start_workers.py; do
  if [[ ! -f "$required_file" ]]; then
    echo "❌ Error: $required_file not found" >&2
    exit 1
  fi
done
# Run cleanup, then terminate the script with status 1.
# cleanup finishes with its own `exit`, so a plain `cleanup; exit 1` never
# reaches the `exit 1`. An `exit` issued from an EXIT trap overrides the
# status, which lets startup failures be reported as failures.
abort_startup() {
  trap 'exit 1' EXIT
  cleanup
  exit 1
}

# Start dispatcher in the background; its stdout/stderr go to dispatcher.log.
echo "🎯 Starting dispatcher..."
python dispatcher.py --port "$DISPATCHER_PORT" > dispatcher.log 2>&1 &
DISPATCHER_PID=$!

# Wait a bit for dispatcher to start
sleep 3

# Check if dispatcher started successfully (signal 0 only probes the PID).
if ! kill -0 "$DISPATCHER_PID" 2>/dev/null; then
  echo "❌ Failed to start dispatcher. Check dispatcher.log for errors." >&2
  exit 1
fi
echo "✅ Dispatcher started (PID: $DISPATCHER_PID)"

# Start workers. The launcher runs in the foreground, so its exit status is
# available as soon as it returns.
echo "🔧 Starting $NUM_GPUS GPU workers..."
python start_workers.py --num-gpus "$NUM_GPUS" --no-monitor > workers.log 2>&1
WORKER_START_EXIT_CODE=$?

# Fail fast: there is no point waiting 60 seconds for workers whose launcher
# has already reported an error.
if [ "$WORKER_START_EXIT_CODE" -ne 0 ]; then
  echo "❌ Failed to start workers. Check workers.log for errors." >&2
  abort_startup
fi

# Wait for workers to fully load models and register (60 seconds)
echo "⏳ Waiting 60 seconds for workers to load models and register..."
sleep 60

# Count running workers by command line (--worker-address format).
# pgrep never matches itself, unlike `ps aux | grep -c`, which also counted
# the grep process. `wc -l` always prints exactly one number, unlike
# `grep -c … || echo 0`, which printed "0" twice when nothing matched.
RUNNING_WORKERS=$(( $(pgrep -f "python.*worker.py.*--worker-address" | wc -l) ))
if [ "$RUNNING_WORKERS" -lt "$NUM_GPUS" ]; then
  echo "❌ Not all workers are running. Expected $NUM_GPUS, found $RUNNING_WORKERS. Check workers.log for errors." >&2
  abort_startup
fi
echo "✅ Workers started successfully ($RUNNING_WORKERS workers running)"
# Print the "ready" banner: endpoints, log files and monitoring hints.
echo ""
echo "🎉 System is ready!"
echo "================================"
echo "🌍 Web interface: http://localhost:$DISPATCHER_PORT"
echo "📊 Dispatcher health: http://localhost:$DISPATCHER_PORT"
echo "🔧 Worker health checks:"
for ((i = 0; i < NUM_GPUS; i++)); do
  echo " GPU $i: http://localhost:$((8001 + i))/health"
done
echo ""
echo "📋 Log files:"
echo " 📊 Analytics (human-readable): system_analytics_*.log"
echo " 🖥️ GPU metrics (JSON): gpu_metrics_*.jsonl"
echo " 🔗 Connection events (JSON): connection_events_*.jsonl"
echo " 📝 Queue metrics (JSON): queue_metrics_*.jsonl"
echo " 🌍 IP statistics (JSON): ip_stats_*.jsonl"
echo " 🎯 Dispatcher: dispatcher.log"
echo " 🔧 Workers summary: workers.log"
for ((i = 0; i < NUM_GPUS; i++)); do
  echo " 🖥️ GPU $i worker: worker_gpu_$i.log"
done
echo ""
echo "💡 Real-time monitoring:"
echo " Human-readable: tail -f system_analytics_*.log"
echo " GPU utilization: tail -f gpu_metrics_*.jsonl"
echo " Connection events: tail -f connection_events_*.jsonl"
echo ""
echo "📈 Data analysis:"
echo " Summary report: python analyze_analytics.py"
echo " Last 6 hours: python analyze_analytics.py --since 6"
echo " GPU analysis only: python analyze_analytics.py --type gpu"
echo "Press Ctrl+C to stop the system"
echo "================================"

# Keep the script running and wait for interrupt, checking every 5 seconds.
# LAST_WORKER_COUNT remembers the previous reading so the reduced-capacity
# warning is printed when the count changes, not on every iteration.
LAST_WORKER_COUNT=$NUM_GPUS
while true; do
  # A dead dispatcher is fatal: shut everything down and report failure.
  if ! kill -0 "$DISPATCHER_PID" 2>/dev/null; then
    echo "⚠️ Dispatcher process died unexpectedly" >&2
    # cleanup finishes with its own `exit`; the EXIT trap overrides that
    # status so the script terminates with 1 instead of reporting success.
    trap 'exit 1' EXIT
    cleanup
    exit 1
  fi

  # Count workers with pgrep: it never matches itself (unlike
  # `ps aux | grep -c`) and `wc -l` always yields exactly one number.
  CURRENT_WORKERS=$(( $(pgrep -f "python.*worker.py.*--worker-address" | wc -l) ))
  if [ "$CURRENT_WORKERS" -lt "$NUM_GPUS" ] \
    && [ "$CURRENT_WORKERS" != "$LAST_WORKER_COUNT" ]; then
    echo "⚠️ Some workers died unexpectedly. Expected $NUM_GPUS, found $CURRENT_WORKERS"
    echo "🔄 System will continue operating with reduced capacity"
    echo "💡 Check worker logs for error details"
    # Don't exit - keep system running with remaining workers
  fi
  LAST_WORKER_COUNT=$CURRENT_WORKERS

  sleep 5
done