LiamKhoaLe committed on
Commit 9f75635 · 1 Parent(s): 70ce1b2

Reduce the model to half-precision (float16) to cut RAM usage

Files changed (4)
  1. Dockerfile +3 -0
  2. app.py +7 -23
  3. requirements.txt +2 -1
  4. warmup.py +8 -0
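
For context on the commit message: casting float32 weights to float16 halves the storage of every parameter, so for all-MiniLM-L6-v2 (about 23M parameters) the weights drop from roughly 90 MB to roughly 45 MB. A minimal sketch of measuring this, not part of the commit:

# Hypothetical measurement (not in this commit): compare fp32 vs fp16 footprint.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device="cpu")
fp32_bytes = sum(p.numel() * p.element_size() for p in model.parameters())
model = model.half()  # cast every weight tensor to torch.float16
fp16_bytes = sum(p.numel() * p.element_size() for p in model.parameters())
print(f"fp32 ~ {fp32_bytes / 2**20:.0f} MiB, fp16 ~ {fp16_bytes / 2**20:.0f} MiB")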
Dockerfile CHANGED
@@ -25,6 +25,9 @@ RUN mkdir -p /app/model_cache /home/user/.cache/huggingface/sentence-transformer
 # Run the model download script
 RUN python /app/download_model.py
 
+# Pre-load model in a separate script
+RUN python /app/download_model.py && python /app/warmup.py
+
 # Ensure ownership and permissions remain intact
 RUN chown -R user:user /app/model_cache
 
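
One thing worth noting about this step: the .half() cast in warmup.py happens in a throwaway build-time process and is not written back to /app/model_cache, so app.py repeats the cast at startup (see below). If the intent were to persist the fp16 weights, a sketch along these lines could work; the /app/model_cache_fp16 path is hypothetical and not part of this commit:

# Hypothetical extension of warmup.py (not in this commit): persist the
# fp16 weights so the app could load them directly without re-casting.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("/app/model_cache", device="cpu")
model = model.half()                  # cast weights to float16
model.save("/app/model_cache_fp16")   # hypothetical output path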
app.py CHANGED
@@ -52,39 +52,23 @@ os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
 # 2. Setup Hugging Face Cloud project model cache
 MODEL_CACHE_DIR = "/app/model_cache"
-# Verify structure
-print("\n📂 LLM Model Structure (Application Level):")
-for root, dirs, files in os.walk(MODEL_CACHE_DIR):
-    print(f"📁 {root}/")
-    for file in files:
-        print(f" 📄 {file}")
 # Ensure all necessary files exist
 required_files = ["config.json", "pytorch_model.bin", "tokenizer.json", "1_Pooling/config.json"]
 for f in required_files:
     if not os.path.exists(os.path.join(MODEL_CACHE_DIR, f)):
         print(f"❌ Missing required model file: {f}")
         exit(1)
-# Check if the required model files exist
-snapshots_path = os.path.join(MODEL_CACHE_DIR, "models--sentence-transformers--all-MiniLM-L6-v2/snapshots")
-if os.path.exists(snapshots_path):
-    snapshot_folders = os.listdir(snapshots_path)
-    if snapshot_folders:
-        model_loc = os.path.join(snapshots_path, snapshot_folders[0])
-        print(f"✅ Found model snapshot at {model_loc}")
-    else:
-        print("❌ No snapshot folder found!")
-        exit(1)
-else:
-    print("❌ No snapshots directory found! Reload ...")
-    exit(1)
 
-# 3. Load the model to application
+# 3. Use the preloaded model from `warmup.py`
 from sentence_transformers import SentenceTransformer
-print("📥 **Loading Embedding Model...**")
+import torch
+print("📥 **Using Preloaded Embedding Model from Warm-up...**")
 start_time = time.time()
 try:
-    embedding_model = SentenceTransformer(model_loc, device="cpu")
-    print("✅ Model loaded successfully in {:.2f} seconds.".format(time.time() - start_time))
+    embedding_model = SentenceTransformer(MODEL_CACHE_DIR, device="cpu")
+    embedding_model = embedding_model.half()  # Ensure it stays quantized
+    embedding_model.to(torch.device("cpu"))
+    print("✅ Model ready in {:.2f} seconds.".format(time.time() - start_time))
 except Exception as e:
     print(f"❌ Error loading model: {e}")
     exit(1)
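
A quick way to sanity-check the halved model after this change is a small encode call; this snippet is illustrative and not part of the commit. Note that float16 kernel coverage on CPU is patchy in some torch builds, so if encode() fails with a "not implemented for 'Half'" error, the cast would need to be dropped or applied selectively:

# Hypothetical smoke test (not in this commit) for the fp16 model on CPU,
# using the embedding_model loaded above in app.py.
sentences = ["hello world", "how are you"]
embeddings = embedding_model.encode(sentences, convert_to_numpy=True)
print(embeddings.shape)  # (2, 384) — all-MiniLM-L6-v2 produces 384-dim vectors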
requirements.txt CHANGED
@@ -10,11 +10,12 @@ sentence-transformers
 # datasets # Expect to load from Mongo only, no need to fetch dataset from HuggingFace unless re-embedding
 # googletrans # Translate and process multi-language with LLM already
 # **Environment**
-python-dotenv # Not used in Streamlit deployment
+python-dotenv # Not used in Streamlit deployment
 pymongo
 # **Deployment**
 uvicorn
 fastapi
+torch # Reduce model load with half-precision (float16) to reduce RAM usage
 # gradio # On Huggingface deployment with gradio or serving FastAPI only
 # streamlit # On streamlit deployment with daemon
 # requests
warmup.py ADDED
@@ -0,0 +1,8 @@
+from sentence_transformers import SentenceTransformer
+import torch
+
+print("🚀 Warming up model...")
+embedding_model = SentenceTransformer("/app/model_cache", device="cpu")
+embedding_model = embedding_model.half()  # Reduce memory
+embedding_model.to(torch.device("cpu"))
+print("✅ Model warm-up complete!")
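
Since the process exits right after this script, the cast itself is discarded; the warm-up mainly validates at build time that the cached model loads and converts cleanly. A small hypothetical check (not in the commit) that the cast actually took effect:

# Hypothetical assertion (not in this commit): all floating-point weights are fp16.
import torch
assert all(p.dtype == torch.float16
           for p in embedding_model.parameters() if p.is_floating_point())
print("dtype check passed")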