Gül Sena Altıntaş committed · Commit 6383574
1 Parent(s): fb396ec

Fixed hf issue, fixed OOM

Browse files:
- app.py +38 -3
- serve_on_killarney.sh +9 -14
app.py CHANGED

@@ -10,12 +10,35 @@ import logging
 from typing import List, Dict, Any
 import gc
 import os
+import psutil
+
+
+def get_memory_usage():
+    """Return (gpu_mem_used_MB, gpu_mem_total_MB, ram_used_MB, ram_total_MB)"""
+    # System RAM
+    vm = psutil.virtual_memory()
+    ram_used_mb = vm.used / (1024 ** 2)
+    ram_total_mb = vm.total / (1024 ** 2)
+
+    # GPU memory
+    if torch.cuda.is_available():
+        gpu_idx = torch.cuda.current_device()
+        torch.cuda.synchronize()
+        gpu_mem_alloc = torch.cuda.memory_allocated(gpu_idx) / (1024 ** 2)
+        gpu_mem_reserved = torch.cuda.memory_reserved(gpu_idx) / (1024 ** 2)
+        gpu_mem_total = torch.cuda.get_device_properties(gpu_idx).total_memory / (1024 ** 2)
+        gpu_mem_used = max(gpu_mem_alloc, gpu_mem_reserved)  # safe estimate
+    else:
+        gpu_mem_used = 0
+        gpu_mem_total = 0
+
+    return gpu_mem_used, gpu_mem_total, ram_used_mb, ram_total_mb
+
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-print("hf_toke_fromglobal", os.environ.get("HF_TOKEN"))
 # Model configurations - maps display names to HF model paths
 PREDEFINED_MODELS = [
     "meta-llama/Llama-3.2-1B",
@@ -92,9 +115,22 @@ def setup_tokenizer(model_path):
     return tokenizer
 
 
-def load_model_and_tokenizer(model_path, use_cache=True, progress_callback=None):
+def load_model_and_tokenizer(model_path, progress_callback=None):
     """Load model and tokenizer with caching"""
     global model_cache
+
+    # Decide caching strategy based on memory usage
+    gpu_used, gpu_total, ram_used, ram_total = get_memory_usage()
+    logger.info(f"Current GPU memory: {gpu_used:.1f}/{gpu_total:.1f} MB")
+    logger.info(f"Current RAM: {ram_used:.1f}/{ram_total:.1f} MB")
+
+    use_cache = not (
+        (gpu_total > 0 and gpu_used / gpu_total > 0.8) or
+        (ram_used / ram_total > 0.8)
+    ) or model_path in model_cache
+    if not use_cache:
+        logger.warning("High memory usage detected — disabling model cache.")
+
 
     if use_cache and model_path in model_cache:
         logger.info(f"Using cached model: {model_path}")
@@ -125,7 +161,6 @@ def load_model_and_tokenizer(model_path, use_cache=True, progress_callback=None)
         progress_callback(0.5, f"🧠 Loading model weights for {model_path}... (this may take a while)")
 
     logger.info(os.getcwd())
-    logger.info("hf token", os.environ.get("HF_TOKEN"))
     # Load model with appropriate settings
     model = AutoModelForCausalLM.from_pretrained(
         model_path,
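The OOM fix above gates the model cache on live memory pressure: new models are cached only while GPU and RAM usage stay below 80%, and an already-cached model is always reused. A minimal, self-contained sketch of that decision is below (the standalone helper name `should_use_cache` is hypothetical; the memory probes and the 0.8 threshold mirror the diff):

```python
# Sketch only: the cache-gating decision introduced in this commit, pulled out
# into a standalone helper so it can be exercised in isolation. The name
# `should_use_cache` is hypothetical; the probes and 0.8 threshold mirror
# app.py's get_memory_usage() and load_model_and_tokenizer().
import psutil
import torch


def get_memory_usage():
    """Return (gpu_used_mb, gpu_total_mb, ram_used_mb, ram_total_mb)."""
    vm = psutil.virtual_memory()
    ram_used_mb, ram_total_mb = vm.used / 2**20, vm.total / 2**20

    if torch.cuda.is_available():
        idx = torch.cuda.current_device()
        # Reserved memory counts the caching allocator's pool, not just live tensors.
        gpu_used_mb = max(torch.cuda.memory_allocated(idx),
                          torch.cuda.memory_reserved(idx)) / 2**20
        gpu_total_mb = torch.cuda.get_device_properties(idx).total_memory / 2**20
    else:
        gpu_used_mb = gpu_total_mb = 0
    return gpu_used_mb, gpu_total_mb, ram_used_mb, ram_total_mb


def should_use_cache(model_path, model_cache, threshold=0.8):
    """Cache new models only while memory pressure is below the threshold,
    but always reuse a model that is already cached."""
    gpu_used, gpu_total, ram_used, ram_total = get_memory_usage()
    over_budget = (gpu_total > 0 and gpu_used / gpu_total > threshold) or (
        ram_used / ram_total > threshold
    )
    return not over_budget or model_path in model_cache


if __name__ == "__main__":
    print(should_use_cache("meta-llama/Llama-3.2-1B", model_cache={}))
```

Comparing against `memory_reserved` as well as `memory_allocated` matters here: PyTorch's caching allocator can hold far more GPU memory than the live tensors, and that reserved pool is what an OOM guard actually competes with.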
serve_on_killarney.sh CHANGED

@@ -15,7 +15,8 @@ NUM_GPUS=1
 NODES=1
 NTASKS_PER_NODE=1
 CPUS_PER_TASK=4
-
+### request more memory to run on more models
+MEM="16G"
 TIME="02:00:00"
 GRADIO_PORT=7861
 script_location="$APP_DIR/$SCRIPT_NAME"
@@ -62,6 +63,9 @@ module load slurm/killarney/24.05.7 StdEnv/2023 gcc/13.3 openmpi/5.0.3 cuda/12.6
 
 # Activate virtual environment
 source "${ENV_PATH}"
+echo $HF_TOKEN
+hf auth login --token $HF_TOKEN
+hf auth whoami
 
 # Set up environment
 export GRADIO_SERVER_NAME="0.0.0.0"
@@ -70,6 +74,7 @@ export GRADIO_SERVER_PORT=$GRADIO_PORT
 # Start Gradio app
 echo "Starting Gradio app on port ${GRADIO_PORT}..."
 gradio "${APP_PATH}" --watch-dirs "${APP_DIR}"
+# python "${APP_PATH}" --watch-dirs "${APP_DIR}"
 
 # Keep the job alive
 echo "Gradio app finished at: \$(date)"
@@ -85,8 +90,6 @@ if [ $? -ne 0 ]; then
 fi
 
 echo "Submitting job to cluster..."
-# JOB_ID=$(ssh -t "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'cd $APP_DIR && sbatch --parsable $script_location'")
-# ssh "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'tail \"${OUTPUT_DIR}/${JOB_ID}.out\"'"
 
 JOB_ID=$(ssh -t "$CLUSTER_USER@$CLUSTER_HOST" \
     "bash -l -c 'cd \"$APP_DIR\" && sbatch --parsable \"$script_location\"'" \
@@ -139,7 +142,6 @@ sleep 10
 
 # Check if Gradio is actually running
 echo "Checking if Gradio app started successfully..."
-# GRADIO_CHECK=$(ssh "$CLUSTER_USER@$CLUSTER_HOST" "ssh $NODE 'ps aux | grep gradio | grep -v grep' 2>/dev/null")
 GRADIO_CHECK=$(ssh "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'ssh $NODE \"ps aux | grep gradio | grep -v grep\"' 2>/dev/null")
 
 # Get NODE locally
@@ -163,10 +165,9 @@ fi
 cancel_job() {
     read -p "Would you like to cancel the job? (y/n): " -n 1 -r
     if [[ $REPLY =~ ^[Yy]$ ]]; then
-        ## job id known only remotely
-        # ssh "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'scancel \${JOB_ID}'"
         ssh "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'scancel ${JOB_ID} '"
-
+    else
+        echo "Job $JOB_ID is still running on $CLUSTER_HOST:$NODE"
     fi
 }
 
@@ -174,8 +175,6 @@ cancel_job() {
 read -p "Would you like to set up port forwarding now? (y/n): " -n 1 -r
 echo ""
 if [[ $REPLY =~ ^[Yy]$ ]]; then
-    # ssh -L "${GRADIO_PORT}:${NODE}:${GRADIO_PORT}" "$CLUSTER_USER@$CLUSTER_HOST" \
-    #     -t "echo 'Port forwarding active: localhost:${GRADIO_PORT} -> ${NODE}:${GRADIO_PORT}'; bash
     # If GRADIO_PORT is in use locally, pick a random free port
     if lsof -iTCP:"$GRADIO_PORT" -sTCP:LISTEN >/dev/null 2>&1; then
         echo "Port $GRADIO_PORT is already in use locally — selecting a free one..."
@@ -189,7 +188,7 @@ if [[ $REPLY =~ ^[Yy]$ ]]; then
 
     echo "Using local port: $LOCAL_PORT"
 
-    echo "Setting up port forwarding... Open
+    echo "Setting up port forwarding... Open http://localhost:${LOCAL_PORT} in your browser to access the app."
     ssh -L "${LOCAL_PORT}:${NODE}:${GRADIO_PORT}" "$CLUSTER_USER@$CLUSTER_HOST" \
         -t "echo 'Port forwarding active: localhost:${LOCAL_PORT} -> ${NODE}:${GRADIO_PORT}'; bash"
 
@@ -199,7 +198,6 @@ if [[ $REPLY =~ ^[Yy]$ ]]; then
     cancel_job
 else
     echo "Skipping port forwarding."
-
     # Connection info
     cat <<EOF
 
@@ -227,7 +225,4 @@ EOF
 echo "Later you can run: ssh -L $GRADIO_PORT:$NODE:$GRADIO_PORT $CLUSTER_USER@$CLUSTER_HOST"
 fi
 
-echo ""
-echo "Job $JOB_ID is still running on $CLUSTER_HOST:$NODE"
-# echo "Don't forget to cancel it when done: ssh $CLUSTER_USER@$CLUSTER_HOST 'scancel $JOB_ID'"
 
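On the serving side, the job script now authenticates with `hf auth login --token $HF_TOKEN` before launching Gradio, which is what let the token-printing debug lines be dropped from app.py. If the app wants to confirm at startup that a usable credential is present without echoing the secret, a sketch along these lines would work (the `check_hf_auth` helper is illustrative and not part of the commit):

```python
# Sketch only: verify Hugging Face credentials at startup without printing the
# token itself (the removed debug lines printed HF_TOKEN directly).
import logging
import os

from huggingface_hub import whoami

logger = logging.getLogger(__name__)


def check_hf_auth() -> bool:
    """Log the authenticated HF user, or warn if no valid credential is found."""
    token = os.environ.get("HF_TOKEN")  # falls back to the `hf auth login` cache when unset
    try:
        user = whoami(token=token)
        logger.info("Hugging Face auth OK, user: %s", user.get("name"))
        return True
    except Exception:
        logger.warning("No valid Hugging Face credential; gated models such as "
                       "meta-llama/Llama-3.2-1B will fail to load.")
        return False
```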