Gül Sena Altıntaş committed
Commit 1d3a5fe · 1 Parent(s): cb0e70e

Killarney script

Files changed (1)
  1. serve_on_killarney.sh +196 -0
serve_on_killarney.sh ADDED
@@ -0,0 +1,196 @@
+ #!/bin/bash
+
+ # Cluster connection configuration
+ CLUSTER_HOST="killarney"
+ CLUSTER_USER="gsa"
+
+ # Job configuration
+ SCRIPT_NAME="gradio_job.slurm"
+ APP_PATH="/home/gsa/quick-tokenizer-accuracy/app.py"
+ JOB_NAME="gradio-app"
+ PARTITION="l40s"
+ NODES=1
+ NTASKS_PER_NODE=1
+ CPUS_PER_TASK=4
+ MEM="8G"
+ TIME="02:00:00"
+ GRADIO_PORT=7860
+ ACCOUNT="aip-craffel"
+ script_location="/project/aip-craffel/gsa/$SCRIPT_NAME"
+
+ ENV_PATH="/home/gsa/tokenizers/.venv/bin/activate"
+ OUTPUT_PATH="/project/aip-craffel/gsa/.slurm"
+
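+ # Note: OUTPUT_PATH is assumed to already exist on the cluster; Slurm does not
+ # create the directory named in --output, so job logs would otherwise be lost.
+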
+ # Function to clean up temporary files (preserves the script's exit status)
+ cleanup() {
+     local status=$?
+     echo "Cleaning up..."
+     if [ -f "$SCRIPT_NAME" ]; then
+         rm "$SCRIPT_NAME"
+     fi
+     exit $status
+ }
+
+ # Set trap for cleanup on script exit
+ trap cleanup EXIT INT TERM
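+ # The trap fires on normal exit as well as on Ctrl-C (INT) and termination (TERM),
+ # so the locally generated job script is removed even if the run is interrupted.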
+
+ # Generate SLURM job script locally
+ cat > "$SCRIPT_NAME" << EOF
+ #!/bin/bash
+ #SBATCH --job-name=$JOB_NAME
+ #SBATCH --partition=$PARTITION
+ #SBATCH --nodes=$NODES
+ #SBATCH --ntasks-per-node=$NTASKS_PER_NODE
+ #SBATCH --cpus-per-task=$CPUS_PER_TASK
+ #SBATCH --mem=$MEM
+ #SBATCH --time=$TIME
+ #SBATCH --account=$ACCOUNT
+ #SBATCH --output=$OUTPUT_PATH/%j.out
+
+ # Print job info
+ echo "Job started on node: \$(hostname)"
+ echo "Job ID: \$SLURM_JOB_ID"
+ echo "Allocated nodes: \$SLURM_JOB_NODELIST"
+ echo "Working directory: \$(pwd)"
+ echo "Starting time: \$(date)"
+
+ # Load necessary modules
+ module load slurm/killarney/24.05.7 StdEnv/2023 gcc/13.3 openmpi/5.0.3 cuda/12.6 python/3.10.13
+
+ # Activate virtual environment
+ source $ENV_PATH
+
+ # Set up environment
+ export GRADIO_SERVER_NAME="0.0.0.0"
+ export GRADIO_SERVER_PORT=$GRADIO_PORT
+
+ # Start Gradio app
+ echo "Starting Gradio app on port $GRADIO_PORT..."
+ python $APP_PATH --no-browser
+
+ # Report when the app exits
+ echo "Gradio app finished at: \$(date)"
+ EOF
+
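+ # Reminder: unescaped $VARS in the heredoc above were expanded on this machine when
+ # the job script was written; the \$-escaped ones are evaluated later on the compute node.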
+ echo "Generated SLURM job script: $SCRIPT_NAME"
+
+ # Transfer the job script to the cluster and submit it
+ scp "$SCRIPT_NAME" "$CLUSTER_USER@$CLUSTER_HOST:$script_location"
+ if [ $? -ne 0 ]; then
+     echo "Error: Failed to transfer job script to cluster"
+     exit 1
+ fi
+
+ echo "Submitting job to cluster..."
+ JOB_ID=$(ssh -t "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'cd /project && sbatch --parsable $script_location'")
+
+ if [ $? -ne 0 ]; then
+     echo "Error: Failed to submit job to cluster"
+     exit 1
+ fi
+
+ # --parsable makes sbatch print only the job ID; strip the carriage return that the
+ # forced pty (-t) appends so $JOB_ID splices cleanly into squeue/scancel calls below.
+ JOB_ID=${JOB_ID//$'\r'/}
+
+ echo "Job submitted with ID: $JOB_ID"
+
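+ # Note: the polling below uses a 5-second interval; terminal states
+ # (FAILED / CANCELLED / TIMEOUT / COMPLETED) abort after showing the job's output file.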
+ # Monitor job status from local machine
+ echo "Monitoring job status from local machine..."
+ while true; do
+     JOB_STATUS=$(ssh "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'squeue -j $JOB_ID -h -o \"%T\" 2>/dev/null'")
+     echo "Job status: $JOB_STATUS"
+
+     if [ -z "$JOB_STATUS" ]; then
+         echo "Error: Job $JOB_ID not found. It may have failed to start."
+         echo "Checking job output..."
+         ssh "$CLUSTER_USER@$CLUSTER_HOST" "ls -la $OUTPUT_PATH/${JOB_ID}.* 2>/dev/null && echo 'Output files:' && cat $OUTPUT_PATH/${JOB_ID}.out 2>/dev/null"
+         exit 1
+     elif [ "$JOB_STATUS" = "RUNNING" ]; then
+         echo "Job is now running!"
+         break
+     elif [ "$JOB_STATUS" = "PENDING" ]; then
+         echo "Job is pending... (waiting for resources)"
+         sleep 5
+     else
+         if [[ "$JOB_STATUS" =~ ^(FAILED|CANCELLED|TIMEOUT|COMPLETED)$ ]]; then
+             echo "Job ended with status: $JOB_STATUS"
+             echo "Checking job output files..."
+             ssh "$CLUSTER_USER@$CLUSTER_HOST" "cat $OUTPUT_PATH/${JOB_ID}.out 2>/dev/null"
+             exit 1
+         fi
+         sleep 5
+     fi
+ done
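+ # The job can also be inspected by hand if the loop is interrupted, e.g. (sketch):
+ #   ssh $CLUSTER_USER@$CLUSTER_HOST "bash -l -c 'sacct -j <JOB_ID> --format=JobID,State,Elapsed'"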
+
+ # Get the allocated node
+ NODE=$(ssh "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'squeue -j $JOB_ID -h -o \"%N\"'")
+ echo "Job is running on node: $NODE"
+
+ # Wait a moment for the Gradio app to start
+ echo "Waiting for Gradio app to initialize..."
+ sleep 10
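+ # The 10 s wait is a heuristic; apps with heavy startup (e.g. model or tokenizer
+ # loading) may need longer before the process check below can see them.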
+
+ # Check if Gradio is actually running on the allocated node
+ echo "Checking if Gradio app started successfully..."
+ GRADIO_CHECK=$(ssh "$CLUSTER_USER@$CLUSTER_HOST" \
+     "bash -l -c 'ssh $NODE \"ps aux | grep gradio | grep -v grep\"' 2>/dev/null")
+
+ # Handle process check
+ if [ -n "$GRADIO_CHECK" ]; then
+     echo "✓ Gradio app appears to be running"
+ else
+     echo "⚠ Warning: Gradio app may not have started properly"
+     echo "Check the job output:"
+     ssh "$CLUSTER_USER@$CLUSTER_HOST" \
+         "bash -l -c 'tail $OUTPUT_PATH/${JOB_ID}.out'"
+ fi
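+ # Alternative liveness check (sketch, network permitting): probe the port from the
+ # login node instead of grepping the process list, e.g.
+ #   ssh $CLUSTER_USER@$CLUSTER_HOST "curl -s -o /dev/null -w '%{http_code}\n' http://$NODE:$GRADIO_PORT/"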
+
+ # Connection info
+ cat <<EOF
+
+ =========================================
+ Gradio app should be running on:
+   Cluster: $CLUSTER_HOST
+   Node:    $NODE
+   Port:    $GRADIO_PORT
+
+ To access from your local machine:
+   ssh -L $GRADIO_PORT:$NODE:$GRADIO_PORT $CLUSTER_USER@$CLUSTER_HOST
+   Then open: http://localhost:$GRADIO_PORT
+
+ Alternative direct SSH with forwarding:
+   ssh -L $GRADIO_PORT:localhost:$GRADIO_PORT $CLUSTER_USER@$NODE.$CLUSTER_HOST
+
+ Check job status:
+   ssh $CLUSTER_USER@$CLUSTER_HOST 'squeue -j $JOB_ID'
+
+ Cancel job:
+   ssh $CLUSTER_USER@$CLUSTER_HOST 'scancel $JOB_ID'
+ =========================================
+
+ EOF
+
+ # Optional port forwarding
+ read -p "Would you like to set up port forwarding now? (y/n): " -n 1 -r
+ echo ""
+ if [[ $REPLY =~ ^[Yy]$ ]]; then
+     echo "Setting up port forwarding..."
+     ssh -L "${GRADIO_PORT}:${NODE}:${GRADIO_PORT}" "$CLUSTER_USER@$CLUSTER_HOST" \
+         -t "echo 'Port forwarding active: localhost:${GRADIO_PORT} -> ${NODE}:${GRADIO_PORT}'; bash"
+     echo ""
+     echo "Port forwarding ended."
+ else
+     echo "Skipping port forwarding."
+     echo "Later you can run: ssh -L $GRADIO_PORT:$NODE:$GRADIO_PORT $CLUSTER_USER@$CLUSTER_HOST"
+ fi
+
+ echo ""
+ echo "Job $JOB_ID is still running on $CLUSTER_HOST:$NODE"
+ echo "Don't forget to cancel it when done: ssh $CLUSTER_USER@$CLUSTER_HOST 'scancel $JOB_ID'"
+
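Usage note (a sketch, assuming an SSH config entry named "killarney" with key-based access for user gsa, and that /home/gsa/quick-tokenizer-accuracy/app.py exists on the cluster):

  bash serve_on_killarney.sh

Once the job reports RUNNING and port forwarding is active, the app should be reachable at http://localhost:7860.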