Gül Sena Altıntaş committed
Commit 6383574 · 1 Parent(s): fb396ec

Fixed hf issue, fixed OOM

Files changed (2):
  1. app.py +38 -3
  2. serve_on_killarney.sh +9 -14
app.py CHANGED
@@ -10,12 +10,35 @@ import logging
 from typing import List, Dict, Any
 import gc
 import os
+import psutil
+
+
+def get_memory_usage():
+    """Return (gpu_mem_used_MB, gpu_mem_total_MB, ram_used_MB, ram_total_MB)"""
+    # System RAM
+    vm = psutil.virtual_memory()
+    ram_used_mb = vm.used / (1024 ** 2)
+    ram_total_mb = vm.total / (1024 ** 2)
+
+    # GPU memory
+    if torch.cuda.is_available():
+        gpu_idx = torch.cuda.current_device()
+        torch.cuda.synchronize()
+        gpu_mem_alloc = torch.cuda.memory_allocated(gpu_idx) / (1024 ** 2)
+        gpu_mem_reserved = torch.cuda.memory_reserved(gpu_idx) / (1024 ** 2)
+        gpu_mem_total = torch.cuda.get_device_properties(gpu_idx).total_memory / (1024 ** 2)
+        gpu_mem_used = max(gpu_mem_alloc, gpu_mem_reserved)  # safe estimate
+    else:
+        gpu_mem_used = 0
+        gpu_mem_total = 0
+
+    return gpu_mem_used, gpu_mem_total, ram_used_mb, ram_total_mb
+
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-print("hf_toke_fromglobal", os.environ.get("HF_TOKEN"))
 # Model configurations - maps display names to HF model paths
 PREDEFINED_MODELS = [
     "meta-llama/Llama-3.2-1B",
@@ -92,9 +115,22 @@ def setup_tokenizer(model_path):
     return tokenizer
 
 
-def load_model_and_tokenizer(model_path, use_cache=True, progress_callback=None):
+def load_model_and_tokenizer(model_path, progress_callback=None):
     """Load model and tokenizer with caching"""
     global model_cache
+
+    # Decide caching strategy based on memory usage
+    gpu_used, gpu_total, ram_used, ram_total = get_memory_usage()
+    logger.info(f"Current GPU memory: {gpu_used:.1f}/{gpu_total:.1f} MB")
+    logger.info(f"Current RAM: {ram_used:.1f}/{ram_total:.1f} MB")
+
+    use_cache = not (
+        (gpu_total > 0 and gpu_used / gpu_total > 0.8) or
+        (ram_used / ram_total > 0.8)
+    ) or model_path in model_cache
+    if not use_cache:
+        logger.warning("High memory usage detected — disabling model cache.")
+
 
     if use_cache and model_path in model_cache:
         logger.info(f"Using cached model: {model_path}")
@@ -125,7 +161,6 @@ def load_model_and_tokenizer(model_path, use_cache=True, progress_callback=None)
         progress_callback(0.5, f"🧠 Loading model weights for {model_path}... (this may take a while)")
 
     logger.info(os.getcwd())
-    logger.info("hf token", os.environ.get("HF_TOKEN"))
     # Load model with appropriate settings
     model = AutoModelForCausalLM.from_pretrained(
         model_path,
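
For reference, the memory-gated caching decision added above can be sanity-checked in isolation. The sketch below is not part of app.py; should_cache() is a hypothetical helper that mirrors the 0.8 (80%) GPU/RAM thresholds from the diff, so the logic can be exercised without loading a model or having a GPU available:

# Sketch only: hypothetical should_cache() mirroring the caching decision
# added to load_model_and_tokenizer(); thresholds copied from the diff above.

def should_cache(gpu_used, gpu_total, ram_used, ram_total, model_path, model_cache):
    """Return True when memory headroom allows caching, or when the model is already cached."""
    gpu_pressure = gpu_total > 0 and gpu_used / gpu_total > 0.8
    ram_pressure = ram_total > 0 and ram_used / ram_total > 0.8
    return not (gpu_pressure or ram_pressure) or model_path in model_cache


if __name__ == "__main__":
    cache = {}
    # Plenty of headroom: caching stays enabled.
    print(should_cache(2_000, 24_000, 8_000, 64_000, "meta-llama/Llama-3.2-1B", cache))   # True
    # GPU above 80% and the model is not cached yet: caching is skipped.
    print(should_cache(20_000, 24_000, 8_000, 64_000, "meta-llama/Llama-3.2-1B", cache))  # False

Note that keeping the model_path in model_cache term in the condition means an already-cached model is still reused under memory pressure; only new models are kept out of the cache.
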
serve_on_killarney.sh CHANGED
@@ -15,7 +15,8 @@ NUM_GPUS=1
 NODES=1
 NTASKS_PER_NODE=1
 CPUS_PER_TASK=4
-MEM="8G"
+### request more memory to run on more models
+MEM="16G"
 TIME="02:00:00"
 GRADIO_PORT=7861
 script_location="$APP_DIR/$SCRIPT_NAME"
@@ -62,6 +63,9 @@ module load slurm/killarney/24.05.7 StdEnv/2023 gcc/13.3 openmpi/5.0.3 cuda/12.6
 
 # Activate virtual environment
 source "${ENV_PATH}"
+echo $HF_TOKEN
+hf auth login --token $HF_TOKEN
+hf auth whoami
 
 # Set up environment
 export GRADIO_SERVER_NAME="0.0.0.0"
@@ -70,6 +74,7 @@ export GRADIO_SERVER_PORT=$GRADIO_PORT
 # Start Gradio app
 echo "Starting Gradio app on port ${GRADIO_PORT}..."
 gradio "${APP_PATH}" --watch-dirs "${APP_DIR}"
+# python "${APP_PATH}" --watch-dirs "${APP_DIR}"
 
 # Keep the job alive
 echo "Gradio app finished at: \$(date)"
@@ -85,8 +90,6 @@ if [ $? -ne 0 ]; then
 fi
 
 echo "Submitting job to cluster..."
-# JOB_ID=$(ssh -t "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'cd $APP_DIR && sbatch --parsable $script_location'")
-# ssh "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'tail \"${OUTPUT_DIR}/${JOB_ID}.out\"'"
 
 JOB_ID=$(ssh -t "$CLUSTER_USER@$CLUSTER_HOST" \
     "bash -l -c 'cd \"$APP_DIR\" && sbatch --parsable \"$script_location\"'" \
@@ -139,7 +142,6 @@ sleep 10
 
 # Check if Gradio is actually running
 echo "Checking if Gradio app started successfully..."
-# GRADIO_CHECK=$(ssh "$CLUSTER_USER@$CLUSTER_HOST" "ssh $NODE 'ps aux | grep gradio | grep -v grep' 2>/dev/null")
 GRADIO_CHECK=$(ssh "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'ssh $NODE \"ps aux | grep gradio | grep -v grep\"' 2>/dev/null")
 
 # Get NODE locally
@@ -163,10 +165,9 @@ fi
 cancel_job() {
     read -p "Would you like to cancel the job? (y/n): " -n 1 -r
     if [[ $REPLY =~ ^[Yy]$ ]]; then
-        ## job id known only remotely
-        # ssh "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'scancel \${JOB_ID}'"
         ssh "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'scancel ${JOB_ID} '"
-        # ssh "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'scancel ${JOB_ID}'"
+    else
+        echo "Job $JOB_ID is still running on $CLUSTER_HOST:$NODE"
     fi
 }
 
@@ -174,8 +175,6 @@ cancel_job() {
 read -p "Would you like to set up port forwarding now? (y/n): " -n 1 -r
 echo ""
 if [[ $REPLY =~ ^[Yy]$ ]]; then
-    # ssh -L "${GRADIO_PORT}:${NODE}:${GRADIO_PORT}" "$CLUSTER_USER@$CLUSTER_HOST" \
-    #     -t "echo 'Port forwarding active: localhost:${GRADIO_PORT} -> ${NODE}:${GRADIO_PORT}'; bash
     # If GRADIO_PORT is in use locally, pick a random free port
    if lsof -iTCP:"$GRADIO_PORT" -sTCP:LISTEN >/dev/null 2>&1; then
        echo "Port $GRADIO_PORT is already in use locally — selecting a free one..."
@@ -189,7 +188,7 @@ if [[ $REPLY =~ ^[Yy]$ ]]; then
 
    echo "Using local port: $LOCAL_PORT"
 
-    echo "Setting up port forwarding... Open https://localhost:${LOCAL_PORT} in your browser to access the app."
+    echo "Setting up port forwarding... Open http://localhost:${LOCAL_PORT} in your browser to access the app."
    ssh -L "${LOCAL_PORT}:${NODE}:${GRADIO_PORT}" "$CLUSTER_USER@$CLUSTER_HOST" \
        -t "echo 'Port forwarding active: localhost:${LOCAL_PORT} -> ${NODE}:${GRADIO_PORT}'; bash"
 
@@ -199,7 +198,6 @@ if [[ $REPLY =~ ^[Yy]$ ]]; then
    cancel_job
 else
    echo "Skipping port forwarding."
-
    # Connection info
    cat <<EOF
 
@@ -227,7 +225,4 @@ EOF
    echo "Later you can run: ssh -L $GRADIO_PORT:$NODE:$GRADIO_PORT $CLUSTER_USER@$CLUSTER_HOST"
 fi
 
-echo ""
-echo "Job $JOB_ID is still running on $CLUSTER_HOST:$NODE"
-# echo "Don't forget to cancel it when done: ssh $CLUSTER_USER@$CLUSTER_HOST 'scancel $JOB_ID'"
 
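
The hf auth login --token $HF_TOKEN and hf auth whoami lines added to serve_on_killarney.sh authenticate the batch job against the Hugging Face Hub before the Gradio app starts, so that gated checkpoints such as meta-llama/Llama-3.2-1B can be downloaded. If an in-process alternative is ever needed, the same login can be done from Python with the huggingface_hub API; a minimal sketch, assuming HF_TOKEN is present in the job environment (as the added echo $HF_TOKEN line implies):

# Sketch only: Python-side equivalent of the shell login step, assuming the
# SLURM job has HF_TOKEN in its environment.
import os

from huggingface_hub import HfApi, login

token = os.environ.get("HF_TOKEN")
if token:
    login(token=token)               # comparable to: hf auth login --token $HF_TOKEN
    print(HfApi().whoami()["name"])  # comparable to: hf auth whoami
else:
    print("HF_TOKEN is not set; gated models such as meta-llama/Llama-3.2-1B will fail to load.")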