#!/usr/bin/env bash
# Launch a Gradio app as a SLURM job on a remote cluster, monitor it until
# it is running, and optionally forward its port to the local machine.
#
# pipefail so that `ssh … | tr …` pipelines report the ssh failure,
# not just the status of the final stage.
set -o pipefail

# Cluster connection configuration
CLUSTER_HOST="killarney"
CLUSTER_USER="gsa"

# Job configuration
ACCOUNT="aip-craffel"
SCRIPT_NAME="gradio_job.slurm"
APP_DIR="/project/$ACCOUNT/$CLUSTER_USER/quick-tokenizer-accuracy"
APP_PATH="app.py"
JOB_NAME="gradio-app"
GPU_TYPE="l40s"
NUM_GPUS=1
NODES=1
NTASKS_PER_NODE=1
CPUS_PER_TASK=4
# Request more memory to run more models.
MEM="16G"
TIME="02:00:00"
GRADIO_PORT=7861

# Derived paths (remote)
script_location="$APP_DIR/$SCRIPT_NAME"
ENV_PATH="/home/$CLUSTER_USER/tokenizers/.venv/bin/activate"
OUTPUT_DIR="/project/$ACCOUNT/$CLUSTER_USER/.slurm"
# Remove the locally generated job script on exit.
# Preserves the exit status in effect when the trap fired (the original
# `exit 0` here turned failures and Ctrl-C into apparent success), and
# clears the traps so `exit` inside the handler cannot re-trigger it.
cleanup() {
  local rv=$?
  trap - EXIT INT TERM
  echo "Cleaning up..."
  if [ -f "$SCRIPT_NAME" ]; then
    rm -f -- "$SCRIPT_NAME"
  fi
  exit "$rv"
}

# Set trap for cleanup on script exit
trap cleanup EXIT INT TERM
# Generate the SLURM job script locally.
# The heredoc delimiter is unquoted, so configuration variables (including
# $HF_TOKEN) are expanded NOW on the local machine; expansions that must
# happen on the compute node at run time are escaped with \$.
cat > "$SCRIPT_NAME" << EOF
#!/bin/bash
#SBATCH --job-name=$JOB_NAME
#SBATCH --gres=gpu:$GPU_TYPE:$NUM_GPUS
#SBATCH --nodes=$NODES
#SBATCH --ntasks-per-node=$NTASKS_PER_NODE
#SBATCH --cpus-per-task=$CPUS_PER_TASK
#SBATCH --mem=$MEM
#SBATCH --time=$TIME
#SBATCH --account=$ACCOUNT
#SBATCH --output=$OUTPUT_DIR/%j.out

# Print job info
echo "Job started on node: \$(hostname)"
echo "Job ID: \$SLURM_JOB_ID"
echo "Allocated nodes: \$SLURM_JOB_NODELIST"
echo "Working directory: \$(pwd)"
echo "Starting time: \$(date)"

source /home/$CLUSTER_USER/.bashrc

# Load necessary modules
module load slurm/killarney/24.05.7 StdEnv/2023 gcc/13.3 openmpi/5.0.3 cuda/12.6 python/3.10.13

# Activate virtual environment
source "${ENV_PATH}"

# Authenticate with Hugging Face. Do NOT echo the token: this job's stdout
# is captured in the --output log file, so printing it would leak the secret.
hf auth login --token $HF_TOKEN
hf auth whoami

# Set up environment
export GRADIO_SERVER_NAME="0.0.0.0"
export GRADIO_SERVER_PORT=$GRADIO_PORT

# Start Gradio app (blocks until the app exits or the job is killed)
echo "Starting Gradio app on port ${GRADIO_PORT}..."
gradio "${APP_PATH}" --watch-dirs "${APP_DIR}"

echo "Gradio app finished at: \$(date)"
EOF

echo "Generated SLURM job script: $SCRIPT_NAME"
# Transfer the job script to the cluster and submit it.
if ! scp "$SCRIPT_NAME" "$CLUSTER_USER@$CLUSTER_HOST:$script_location"; then
  echo "Error: Failed to transfer job script to cluster" >&2
  exit 1
fi

echo "Submitting job to cluster..."
# --parsable makes sbatch print just the job id; strip the CR/LF that
# `ssh -t` (pty) appends so later squeue calls get a clean id.
# NB: without validation, `$?` here would only reflect `tr`, not ssh/sbatch.
JOB_ID=$(ssh -t "$CLUSTER_USER@$CLUSTER_HOST" \
  "bash -l -c 'cd \"$APP_DIR\" && sbatch --parsable \"$script_location\"'" \
  | tr -d '\r\n')
if [ $? -ne 0 ] || ! [[ "$JOB_ID" =~ ^[0-9]+ ]]; then
  echo "Error: Failed to submit job to cluster (got: '$JOB_ID')" >&2
  exit 1
fi
echo "Job submitted with ID: $JOB_ID"
# Monitor job status from the local machine until the job starts running.
# squeue -h -o "%T" prints just the state (PENDING/RUNNING/…), or nothing
# once the job has left the queue.
echo "Monitoring job status from local machine..."
while true; do
  JOB_STATUS=$(ssh "$CLUSTER_USER@$CLUSTER_HOST" \
    "bash -l -c 'squeue -j $JOB_ID -h -o \"%T\" 2>/dev/null'")
  echo "Job status: $JOB_STATUS"
  if [ -z "$JOB_STATUS" ]; then
    # squeue no longer knows the job: it failed to start or already exited.
    echo "Error: Job $JOB_ID not found. It may have failed to start." >&2
    echo "Checking job output..."
    # The log lives where #SBATCH --output points, not in the home directory.
    ssh "$CLUSTER_USER@$CLUSTER_HOST" \
      "cat \"$OUTPUT_DIR/$JOB_ID.out\" 2>/dev/null"
    exit 1
  elif [ "$JOB_STATUS" = "RUNNING" ]; then
    echo "Job is now running!"
    break
  elif [ "$JOB_STATUS" = "PENDING" ]; then
    echo "Job is pending... (waiting for resources)"
    sleep 5
  elif [[ "$JOB_STATUS" =~ ^(FAILED|CANCELLED|TIMEOUT|COMPLETED)$ ]]; then
    echo "Job ended with status: $JOB_STATUS" >&2
    echo "Checking job output files..."
    ssh "$CLUSTER_USER@$CLUSTER_HOST" \
      "cat \"$OUTPUT_DIR/$JOB_ID.out\" 2>/dev/null"
    exit 1
  else
    # Transitional states (CONFIGURING, COMPLETING, …): keep polling.
    sleep 5
  fi
done
# Get the node the job was allocated to (the original fetched NODE and the
# process check twice, verbatim — once is enough).
NODE=$(ssh "$CLUSTER_USER@$CLUSTER_HOST" \
  "bash -l -c 'squeue -j $JOB_ID -h -o \"%N\"'")
echo "Job (${JOB_ID}) is running on node: ${NODE}"

# Give the Gradio app a moment to start.
echo "Waiting for Gradio app to initialize..."
sleep 10

# Check whether a gradio process is actually running on the allocated node
# (hop: local -> login node -> compute node).
echo "Checking if Gradio app started successfully..."
GRADIO_CHECK=$(ssh "$CLUSTER_USER@$CLUSTER_HOST" \
  "bash -l -c 'ssh $NODE \"pgrep -af gradio\"' 2>/dev/null")

if [ -n "$GRADIO_CHECK" ]; then
  echo "✓ Gradio app appears to be running"
else
  echo "⚠ Warning: Gradio app may not have started properly"
  echo "Check the job output:"
  ssh "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'tail \"${OUTPUT_DIR}/${JOB_ID}.out\"'"
fi
# Ask the user whether to cancel the SLURM job; if declined, report where
# the job is still running.
cancel_job() {
  local answer
  read -p "Would you like to cancel the job? (y/n): " -n 1 -r
  answer=$REPLY
  if [[ ! $answer =~ ^[Yy]$ ]]; then
    echo "Job $JOB_ID is still running on $CLUSTER_HOST:$NODE"
    return
  fi
  ssh "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'scancel ${JOB_ID} '"
}
# Optional port forwarding
read -p "Would you like to set up port forwarding now? (y/n): " -n 1 -r
echo ""
if [[ $REPLY =~ ^[Yy]$ ]]; then
  # If GRADIO_PORT is in use locally, pick a random free port instead.
  if lsof -iTCP:"$GRADIO_PORT" -sTCP:LISTEN >/dev/null 2>&1; then
    echo "Port $GRADIO_PORT is already in use locally — selecting a free one..."
    # comm -23 = candidate ports minus ports currently LISTENing; both
    # inputs must share the same (lexicographic) sort order for comm to
    # work. The awk stage picks one survivor at random.
    LOCAL_PORT=$(comm -23 \
      <(seq 1024 65535 | sort) \
      <(lsof -nP -iTCP -sTCP:LISTEN | awk 'NR>1 {print $9}' | awk -F: '{print $NF}' | sort -u) \
      | awk 'BEGIN{srand()} {ports[NR]=$0} END{print ports[int(rand()*NR)+1]}')
  else
    LOCAL_PORT="$GRADIO_PORT"
  fi
  echo "Using local port: $LOCAL_PORT"
  echo "Setting up port forwarding... Open http://localhost:${LOCAL_PORT} in your browser to access the app."
  # -t + `bash` keeps an interactive shell open so the tunnel stays up
  # until the user exits it.
  ssh -L "${LOCAL_PORT}:${NODE}:${GRADIO_PORT}" "$CLUSTER_USER@$CLUSTER_HOST" \
    -t "echo 'Port forwarding active: localhost:${LOCAL_PORT} -> ${NODE}:${GRADIO_PORT}'; bash"
  echo ""
  echo "Port forwarding ended."
  cancel_job
else
  echo "Skipping port forwarding."
  # Connection info. Unquoted heredoc: all $vars expand now, which is what
  # we want for a copy-pasteable cheat sheet.
  cat <<EOF
=========================================
Gradio app should be running on:
  Cluster: $CLUSTER_HOST
  Node: $NODE
  Port: $GRADIO_PORT

To access from your local machine:
  ssh -L $GRADIO_PORT:$NODE:$GRADIO_PORT $CLUSTER_USER@$CLUSTER_HOST
  Then open: http://localhost:$GRADIO_PORT

Alternative direct SSH with forwarding:
  ssh -L $GRADIO_PORT:localhost:$GRADIO_PORT $CLUSTER_USER@$NODE.$CLUSTER_HOST

Check job status:
  ssh $CLUSTER_USER@$CLUSTER_HOST 'squeue -j $JOB_ID'

Cancel job:
  ssh $CLUSTER_USER@$CLUSTER_HOST 'scancel $JOB_ID'
=========================================
EOF
  echo "Later you can run: ssh -L $GRADIO_PORT:$NODE:$GRADIO_PORT $CLUSTER_USER@$CLUSTER_HOST"
fi