LiamKhoaLe committed on
Commit 9f75635 · 1 Parent(s): 70ce1b2

Reduce the model to half-precision (float16) to cut RAM usage

Files changed (4)
  1. Dockerfile +3 -0
  2. app.py +7 -23
  3. requirements.txt +2 -1
  4. warmup.py +8 -0
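
For context on the commit message: casting float32 weights to float16 halves the storage of every parameter, so for all-MiniLM-L6-v2 (about 23M parameters) the weights drop from roughly 90 MB to roughly 45 MB. A minimal sketch of measuring this, not part of the commit:

# Hypothetical measurement (not in this commit): compare fp32 vs fp16 footprint.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device="cpu")
fp32_bytes = sum(p.numel() * p.element_size() for p in model.parameters())
model = model.half()  # cast every weight tensor to torch.float16
fp16_bytes = sum(p.numel() * p.element_size() for p in model.parameters())
print(f"fp32 ~ {fp32_bytes / 2**20:.0f} MiB, fp16 ~ {fp16_bytes / 2**20:.0f} MiB")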
Dockerfile CHANGED
@@ -25,6 +25,9 @@ RUN mkdir -p /app/model_cache /home/user/.cache/huggingface/sentence-transformer
 # Run the model download script
 RUN python /app/download_model.py
 
+# Pre-load model in a separate script
+RUN python /app/download_model.py && python /app/warmup.py
+
 # Ensure ownership and permissions remain intact
 RUN chown -R user:user /app/model_cache
 
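
One thing worth noting about this step: the .half() cast in warmup.py happens in a throwaway build-time process and is not written back to /app/model_cache, so app.py repeats the cast at startup (see below). If the intent were to persist the fp16 weights, a sketch along these lines could work; the /app/model_cache_fp16 path is hypothetical and not part of this commit:

# Hypothetical extension of warmup.py (not in this commit): persist the
# fp16 weights so the app could load them directly without re-casting.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("/app/model_cache", device="cpu")
model = model.half()                  # cast weights to float16
model.save("/app/model_cache_fp16")   # hypothetical output path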
app.py CHANGED
@@ -52,39 +52,23 @@ os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
 # 2. Setup Hugging Face Cloud project model cache
 MODEL_CACHE_DIR = "/app/model_cache"
-# Verify structure
-print("\n📂 LLM Model Structure (Application Level):")
-for root, dirs, files in os.walk(MODEL_CACHE_DIR):
-    print(f"📁 {root}/")
-    for file in files:
-        print(f" 📄 {file}")
 # Ensure all necessary files exist
 required_files = ["config.json", "pytorch_model.bin", "tokenizer.json", "1_Pooling/config.json"]
 for f in required_files:
     if not os.path.exists(os.path.join(MODEL_CACHE_DIR, f)):
         print(f"❌ Missing required model file: {f}")
         exit(1)
-# Check if the required model files exist
-snapshots_path = os.path.join(MODEL_CACHE_DIR, "models--sentence-transformers--all-MiniLM-L6-v2/snapshots")
-if os.path.exists(snapshots_path):
-    snapshot_folders = os.listdir(snapshots_path)
-    if snapshot_folders:
-        model_loc = os.path.join(snapshots_path, snapshot_folders[0])
-        print(f"✅ Found model snapshot at {model_loc}")
-    else:
-        print("❌ No snapshot folder found!")
-        exit(1)
-else:
-    print("❌ No snapshots directory found! Reload ...")
-    exit(1)
 
-# 3. Load the model to application
+# 3. Use the preloaded model from `warmup.py`
 from sentence_transformers import SentenceTransformer
-print("📥 **Loading Embedding Model...**")
+import torch
+print("📥 **Using Preloaded Embedding Model from Warm-up...**")
 start_time = time.time()
 try:
-    embedding_model = SentenceTransformer(model_loc, device="cpu")
-    print("✅ Model loaded successfully in {:.2f} seconds.".format(time.time() - start_time))
+    embedding_model = SentenceTransformer(MODEL_CACHE_DIR, device="cpu")
+    embedding_model = embedding_model.half()  # Ensure it stays quantized
+    embedding_model.to(torch.device("cpu"))
+    print("✅ Model ready in {:.2f} seconds.".format(time.time() - start_time))
 except Exception as e:
     print(f"❌ Error loading model: {e}")
     exit(1)
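
A quick way to sanity-check the halved model after this change is a small encode call; this snippet is illustrative and not part of the commit. Note that float16 kernel coverage on CPU is patchy in some torch builds, so if encode() fails with a "not implemented for 'Half'" error, the cast would need to be dropped or applied selectively:

# Hypothetical smoke test (not in this commit) for the fp16 model on CPU,
# using the embedding_model loaded above in app.py.
sentences = ["hello world", "how are you"]
embeddings = embedding_model.encode(sentences, convert_to_numpy=True)
print(embeddings.shape)  # (2, 384) — all-MiniLM-L6-v2 produces 384-dim vectors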
requirements.txt CHANGED
@@ -10,11 +10,12 @@ sentence-transformers
 # datasets # Expect to load from Mongo only, no need to fetch dataset from HuggingFace unless re-embedding
 # googletrans # Translate and process multi-language with LLM already
 # **Environment**
-python-dotenv # Not used in Streamlit deployment
+python-dotenv # Not used in Streamlit deployment
 pymongo
 # **Deployment**
 uvicorn
 fastapi
+torch # Reduce model load with half-precision (float16) to reduce RAM usage
 # gradio # On Huggingface deployment with gradio or serving FastAPI only
 # streamlit # On streamlit deployment with daemon
 # requests
warmup.py ADDED
@@ -0,0 +1,8 @@
+from sentence_transformers import SentenceTransformer
+import torch
+
+print("🚀 Warming up model...")
+embedding_model = SentenceTransformer("/app/model_cache", device="cpu")
+embedding_model = embedding_model.half()  # Reduce memory
+embedding_model.to(torch.device("cpu"))
+print("✅ Model warm-up complete!")
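
Since the process exits right after this script, the cast itself is discarded; the warm-up mainly validates at build time that the cached model loads and converts cleanly. A small hypothetical check (not in the commit) that the cast actually took effect:

# Hypothetical assertion (not in this commit): all floating-point weights are fp16.
import torch
assert all(p.dtype == torch.float16
           for p in embedding_model.parameters() if p.is_floating_point())
print("dtype check passed")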