Gül Sena Altıntaş committed
Commit 1d3a5fe · 1 Parent(s): cb0e70e

Killarney script

Files changed (1)
  1. serve_on_killarney.sh +196 -0
serve_on_killarney.sh ADDED
@@ -0,0 +1,196 @@
+ #!/bin/bash
+
+ # Cluster connection configuration
+ CLUSTER_HOST="killarney"
+ CLUSTER_USER="gsa"
+
+ # Job configuration
+ SCRIPT_NAME="gradio_job.slurm"
+ APP_PATH="/home/gsa/quick-tokenizer-accuracy/app.py"
+ JOB_NAME="gradio-app"
+ PARTITION="l40s"
+ NODES=1
+ NTASKS_PER_NODE=1
+ CPUS_PER_TASK=4
+ MEM="8G"
+ TIME="02:00:00"
+ GRADIO_PORT=7860
+ ACCOUNT="aip-craffel"
+ script_location="/project/aip-craffel/gsa/$SCRIPT_NAME"
+
+ ENV_PATH="/home/gsa/tokenizers/.venv/bin/activate"
+ OUTPUT_PATH="/project/aip-craffel/gsa/.slurm"
+
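+ # Note: OUTPUT_PATH is assumed to already exist on the cluster; Slurm does not
+ # create the directory named in --output, so job logs would otherwise be lost.
+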
+ # Function to clean up temporary files (preserves the script's exit status)
+ cleanup() {
+     local status=$?
+     echo "Cleaning up..."
+     if [ -f "$SCRIPT_NAME" ]; then
+         rm "$SCRIPT_NAME"
+     fi
+     exit $status
+ }
+
+ # Set trap for cleanup on script exit
+ trap cleanup EXIT INT TERM
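+ # The trap fires on normal exit as well as on Ctrl-C (INT) and termination (TERM),
+ # so the locally generated job script is removed even if the run is interrupted.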
+
+ # Generate SLURM job script locally
+ cat > "$SCRIPT_NAME" << EOF
+ #!/bin/bash
+ #SBATCH --job-name=$JOB_NAME
+ #SBATCH --partition=$PARTITION
+ #SBATCH --nodes=$NODES
+ #SBATCH --ntasks-per-node=$NTASKS_PER_NODE
+ #SBATCH --cpus-per-task=$CPUS_PER_TASK
+ #SBATCH --mem=$MEM
+ #SBATCH --time=$TIME
+ #SBATCH --account=$ACCOUNT
+ #SBATCH --output=$OUTPUT_PATH/%j.out
+
+ # Print job info
+ echo "Job started on node: \$(hostname)"
+ echo "Job ID: \$SLURM_JOB_ID"
+ echo "Allocated nodes: \$SLURM_JOB_NODELIST"
+ echo "Working directory: \$(pwd)"
+ echo "Starting time: \$(date)"
+
+ # Load necessary modules
+ module load slurm/killarney/24.05.7 StdEnv/2023 gcc/13.3 openmpi/5.0.3 cuda/12.6 python/3.10.13
+
+ # Activate virtual environment
+ source $ENV_PATH
+
+ # Set up environment
+ export GRADIO_SERVER_NAME="0.0.0.0"
+ export GRADIO_SERVER_PORT=$GRADIO_PORT
+
+ # Start Gradio app
+ echo "Starting Gradio app on port $GRADIO_PORT..."
+ python $APP_PATH --no-browser
+
+ # Report when the app exits
+ echo "Gradio app finished at: \$(date)"
+ EOF
+
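+ # Reminder: unescaped $VARS in the heredoc above were expanded on this machine when
+ # the job script was written; the \$-escaped ones are evaluated later on the compute node.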
+ echo "Generated SLURM job script: $SCRIPT_NAME"
+
+ # Transfer the job script to the cluster and submit it
+ scp "$SCRIPT_NAME" "$CLUSTER_USER@$CLUSTER_HOST:$script_location"
+ if [ $? -ne 0 ]; then
+     echo "Error: Failed to transfer job script to cluster"
+     exit 1
+ fi
+
+ echo "Submitting job to cluster..."
+ JOB_ID=$(ssh -t "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'cd /project && sbatch --parsable $script_location'")
+
+ if [ $? -ne 0 ]; then
+     echo "Error: Failed to submit job to cluster"
+     exit 1
+ fi
+
+ # --parsable makes sbatch print only the job ID; strip the carriage return that the
+ # forced pty (-t) appends so $JOB_ID splices cleanly into squeue/scancel calls below.
+ JOB_ID=${JOB_ID//$'\r'/}
+
+ echo "Job submitted with ID: $JOB_ID"
+
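+ # Note: the polling below uses a 5-second interval; terminal states
+ # (FAILED / CANCELLED / TIMEOUT / COMPLETED) abort after showing the job's output file.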
+ # Monitor job status from local machine
+ echo "Monitoring job status from local machine..."
+ while true; do
+     JOB_STATUS=$(ssh "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'squeue -j $JOB_ID -h -o \"%T\" 2>/dev/null'")
+     echo "Job status: $JOB_STATUS"
+
+     if [ -z "$JOB_STATUS" ]; then
+         echo "Error: Job $JOB_ID not found. It may have failed to start."
+         echo "Checking job output..."
+         ssh "$CLUSTER_USER@$CLUSTER_HOST" "ls -la $OUTPUT_PATH/${JOB_ID}.* 2>/dev/null && echo 'Output files:' && cat $OUTPUT_PATH/${JOB_ID}.out 2>/dev/null"
+         exit 1
+     elif [ "$JOB_STATUS" = "RUNNING" ]; then
+         echo "Job is now running!"
+         break
+     elif [ "$JOB_STATUS" = "PENDING" ]; then
+         echo "Job is pending... (waiting for resources)"
+         sleep 5
+     else
+         if [[ "$JOB_STATUS" =~ ^(FAILED|CANCELLED|TIMEOUT|COMPLETED)$ ]]; then
+             echo "Job ended with status: $JOB_STATUS"
+             echo "Checking job output files..."
+             ssh "$CLUSTER_USER@$CLUSTER_HOST" "cat $OUTPUT_PATH/${JOB_ID}.out 2>/dev/null"
+             exit 1
+         fi
+         sleep 5
+     fi
+ done
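+ # The job can also be inspected by hand if the loop is interrupted, e.g. (sketch):
+ #   ssh $CLUSTER_USER@$CLUSTER_HOST "bash -l -c 'sacct -j <JOB_ID> --format=JobID,State,Elapsed'"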
+
+ # Get the allocated node
+ NODE=$(ssh "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'squeue -j $JOB_ID -h -o \"%N\"'")
+ echo "Job is running on node: $NODE"
+
+ # Wait a moment for the Gradio app to start
+ echo "Waiting for Gradio app to initialize..."
+ sleep 10
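+ # The 10 s wait is a heuristic; apps with heavy startup (e.g. model or tokenizer
+ # loading) may need longer before the process check below can see them.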
+
+ # Check if Gradio is actually running on the allocated node
+ echo "Checking if Gradio app started successfully..."
+ GRADIO_CHECK=$(ssh "$CLUSTER_USER@$CLUSTER_HOST" \
+     "bash -l -c 'ssh $NODE \"ps aux | grep gradio | grep -v grep\"' 2>/dev/null")
+
+ # Handle process check
+ if [ -n "$GRADIO_CHECK" ]; then
+     echo "✓ Gradio app appears to be running"
+ else
+     echo "⚠ Warning: Gradio app may not have started properly"
+     echo "Check the job output:"
+     ssh "$CLUSTER_USER@$CLUSTER_HOST" \
+         "bash -l -c 'tail $OUTPUT_PATH/${JOB_ID}.out'"
+ fi
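+ # Alternative liveness check (sketch, network permitting): probe the port from the
+ # login node instead of grepping the process list, e.g.
+ #   ssh $CLUSTER_USER@$CLUSTER_HOST "curl -s -o /dev/null -w '%{http_code}\n' http://$NODE:$GRADIO_PORT/"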
+
+ # Connection info
+ cat <<EOF
+
+ =========================================
+ Gradio app should be running on:
+   Cluster: $CLUSTER_HOST
+   Node:    $NODE
+   Port:    $GRADIO_PORT
+
+ To access from your local machine:
+   ssh -L $GRADIO_PORT:$NODE:$GRADIO_PORT $CLUSTER_USER@$CLUSTER_HOST
+   Then open: http://localhost:$GRADIO_PORT
+
+ Alternative direct SSH with forwarding:
+   ssh -L $GRADIO_PORT:localhost:$GRADIO_PORT $CLUSTER_USER@$NODE.$CLUSTER_HOST
+
+ Check job status:
+   ssh $CLUSTER_USER@$CLUSTER_HOST 'squeue -j $JOB_ID'
+
+ Cancel job:
+   ssh $CLUSTER_USER@$CLUSTER_HOST 'scancel $JOB_ID'
+ =========================================
+
+ EOF
+
+ # Optional port forwarding
+ read -p "Would you like to set up port forwarding now? (y/n): " -n 1 -r
+ echo ""
+ if [[ $REPLY =~ ^[Yy]$ ]]; then
+     echo "Setting up port forwarding..."
+     ssh -L "${GRADIO_PORT}:${NODE}:${GRADIO_PORT}" "$CLUSTER_USER@$CLUSTER_HOST" \
+         -t "echo 'Port forwarding active: localhost:${GRADIO_PORT} -> ${NODE}:${GRADIO_PORT}'; bash"
+     echo ""
+     echo "Port forwarding ended."
+ else
+     echo "Skipping port forwarding."
+     echo "Later you can run: ssh -L $GRADIO_PORT:$NODE:$GRADIO_PORT $CLUSTER_USER@$CLUSTER_HOST"
+ fi
+
+ echo ""
+ echo "Job $JOB_ID is still running on $CLUSTER_HOST:$NODE"
+ echo "Don't forget to cancel it when done: ssh $CLUSTER_USER@$CLUSTER_HOST 'scancel $JOB_ID'"
+
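Usage note (a sketch, assuming an SSH config entry named "killarney" with key-based access for user gsa, and that /home/gsa/quick-tokenizer-accuracy/app.py exists on the cluster):

  bash serve_on_killarney.sh

Once the job reports RUNNING and port forwarding is active, the app should be reachable at http://localhost:7860.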