Spaces:

BinKhoaLe1812
/

Medical-Chatbot

Running

App Files Files Community

LiamKhoaLe commited on Mar 11

Commit

4455263

1 Parent(s): 9f75635

Optimized FastAPI for Hugging Face Spaces

Browse files

Files changed (2) hide show

Dockerfile +0 -3
app.py +72 -193

Dockerfile CHANGED Viewed

@@ -22,9 +22,6 @@ ENV SENTENCE_TRANSFORMERS_HOME="/home/user/.cache/huggingface/sentence-transform
 RUN mkdir -p /app/model_cache /home/user/.cache/huggingface/sentence-transformers && \
     chown -R user:user /app/model_cache /home/user/.cache/huggingface
-# Run the model download script
-RUN python /app/download_model.py
 # Pre-load model in a separate script
 RUN python /app/download_model.py && python /app/warmup.py

 RUN mkdir -p /app/model_cache /home/user/.cache/huggingface/sentence-transformers && \
     chown -R user:user /app/model_cache /home/user/.cache/huggingface
 # Pre-load model in a separate script
 RUN python /app/download_model.py && python /app/warmup.py

app.py CHANGED Viewed

@@ -1,182 +1,89 @@
-# ==========================
-# Medical Chatbot Backend (Gemini Flash API + RAG) - Local Prebuilt Model with FAISS Index & Data Stored in MongoDB
-# ==========================
-"""
-This script loads:
-  1) A FAISS index stored in MongoDB (in the "faiss_index" collection)
-  2) A local SentenceTransformer model (downloaded via snapshot_download)
-  3) QA data (the full dataset of 256916 QA entries) stored in MongoDB (in the "qa_data" collection)
-If the QA data or FAISS index are not found in MongoDB, the script loads the full dataset from Hugging Face,
-computes embeddings for all QA pairs (concatenating the "Patient" and "Doctor" fields), and stores both the raw QA data
-and the FAISS index in MongoDB.
-The chatbot instructs Gemini Flash to format its answer using markdown.
-"""
 import os
 import faiss
 import numpy as np
 import time
 from fastapi import FastAPI
 from fastapi.responses import JSONResponse
-from pathlib import Path
-import threading
-from dotenv import load_dotenv
-# Checking status
-print("🚀 Starting the script...")
-# 🔹 Load environment variables from .env
-load_dotenv()
-gemini_flash_api_key = os.getenv("FlashAPI")
 mongo_uri = os.getenv("MONGO_URI")
 index_uri = os.getenv("INDEX_URI")
-# Verify ENV Variables
-print(f"🔎 FlashAPI Key Exists: {'✅ Yes' if os.getenv('FlashAPI') else '❌ No'}")
-print(f"🔎 MongoDB URI Exists: {'✅ Yes' if os.getenv('MONGO_URI') else '❌ No'}")
-print(f"🔎 FAISS Index URI Exists: {'✅ Yes' if os.getenv('INDEX_URI') else '❌ No'}")
-# Heartbeat status checker
-def keep_alive():
-    while True:
-        print("💓 Heartbeat: App is alive")
-        time.sleep(10)
-threading.Thread(target=keep_alive, daemon=True).start()
-# 1. Environment variables to mitigate segmentation faults
 os.environ["OMP_NUM_THREADS"] = "1"
-os.environ["MKL_NUM_THREADS"] = "1"
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
-# 2. Setup Hugging Face Cloud project model cache
-MODEL_CACHE_DIR = "/app/model_cache"
-# Ensure all necessary files exist
-required_files = ["config.json", "pytorch_model.bin", "tokenizer.json", "1_Pooling/config.json"]
-for f in required_files:
-    if not os.path.exists(os.path.join(MODEL_CACHE_DIR, f)):
-        print(f"❌ Missing required model file: {f}")
-        exit(1)
-# 3. Use the preloaded model from `warmup.py`
-from sentence_transformers import SentenceTransformer
-import torch
-print("📥 **Using Preloaded Embedding Model from Warm-up...**")
-start_time = time.time()
-try:
-    embedding_model = SentenceTransformer(MODEL_CACHE_DIR, device="cpu")
-    embedding_model = embedding_model.half()  # Ensure it stays quantized
-    embedding_model.to(torch.device("cpu"))
-    print("✅ Model ready in {:.2f} seconds.".format(time.time() - start_time))
-except Exception as e:
-    print(f"❌ Error loading model: {e}")
-    exit(1)
-##---------------------------##
-## EMBEDDING AND DATA RETRIEVAL
-##---------------------------##
-# 🔹 MongoDB Connection
-from pymongo import MongoClient
-try:
-    print("⏳ Connecting to MongoDB...")
-    # 1. QA client
-    client = MongoClient(mongo_uri)
-    db = client["MedicalChatbotDB"]  # Use your chosen database name
-    qa_collection = db["qa_data"]
-    # 2. FAISS index client
-    iclient = MongoClient(index_uri)
-    idb = iclient["MedicalChatbotDB"]  # Use your chosen database name
-    index_collection = idb["faiss_index_files"]
-except Exception as e:
-    print(f"❌ MongoDB connection failed: {e}")
-    exit(1)
-# 🔹 Load or Build QA Data in MongoDB
-print("⏳ Checking MongoDB for existing QA data...")
-if qa_collection.count_documents({}) == 0:
-    print("⚠️ QA data not found in MongoDB! Please report this to the developer.")
-else:
-    print("✅ Loaded existing QA data from MongoDB.")
-    # Use an aggregation pipeline with allowDiskUse to sort by "i" without creating an index.
-    qa_docs = list(qa_collection.aggregate([
-            {"$sort": {"i": 1}},
-            {"$project": {"_id": 0}}
-        ], allowDiskUse=True))
-    qa_data = qa_docs
-    print("📦 Total QA entries loaded:", len(qa_data))
-# 🔹 Build or Load the FAISS Index from MongoDB using GridFS (on the separate cluster)
-print("⏳ Checking GridFS for existing FAISS index...")
 import gridfs
-fs = gridfs.GridFS(idb, collection="faiss_index_files")  # 'idb' is connected using INDEX_URI
-# 1. Find the FAISS index file by filename.
 def load_faiss_index():
-    existing_file = fs.find_one({"filename": "faiss_index.bin"})
-    if existing_file is None:
-        print("⚠️ FAISS index not found in GridFS! Please report this to the developer.")
-        return None
-    else:
-        print("✅ Found FAISS index in GridFS. Loading...")
-        stored_index_bytes = existing_file.read()
-        index_bytes_np = np.frombuffer(stored_index_bytes, dtype='uint8')
-        index = faiss.deserialize_index(index_bytes_np)
-        print("📦 FAISS index loaded from GridFS successfully!")
-        return index
-# Load FAISS index only when needed
-index = None
-##---------------------------##
-## INFERENCE BACK+FRONT END
-##---------------------------##
-# 🔹 Prepare Retrieval and Chat Logic
-def retrieve_medical_info(query):
-    """Retrieve relevant medical knowledge using the FAISS index."""
     global index
     if index is None:
-        index = load_faiss_index()  # Load FAISS only on first query
     if index is None:
         return ["No medical information available."]
     query_embedding = embedding_model.encode([query], convert_to_numpy=True)
     _, idxs = index.search(query_embedding, k=3)
-    results = []
-    for i in idxs[0]:
-        if i < len(qa_data):
-            results.append(qa_data[i].get("Doctor", "No answer available."))
-        else:
-            results.append("No answer available.")
     return results
-# 🔹 Gemini Flash API Call
-from google import genai
 def gemini_flash_completion(prompt, model, temperature=0.7):
     client_genai = genai.Client(api_key=gemini_flash_api_key)
     try:
         response = client_genai.models.generate_content(model=model, contents=prompt)
         return response.text
     except Exception as e:
-        print(f"⚠️ Error calling Gemini API: {e}")
         return "Error generating response from Gemini."
-# Define a simple language mapping (modify or add more as needed)
-language_map = {
-    "EN": "English",
-    "VI": "Vietnamese",
-    "ZH": "Chinese"
-}
-# 🔹 Chatbot Class
 class RAGMedicalChatbot:
     def __init__(self, model_name, retrieve_function):
         self.model_name = model_name
@@ -185,69 +92,41 @@ class RAGMedicalChatbot:
     def chat(self, user_query, lang="EN"):
         retrieved_info = self.retrieve(user_query)
         knowledge_base = "\n".join(retrieved_info)
-        # Construct prompt for Gemini Flash
-        prompt = (
-            "Please format your answer using markdown. Use **bold** for titles, *italic* for emphasis, "
-            "and ensure that headings and paragraphs are clearly separated.\n\n"
-            f"Using the following medical knowledge:\n{knowledge_base} \n(trained with 256,916 data entries).\n\n"
-            f"Answer the following question in a professional and medically accurate manner:\n{user_query}.\n\n"
-            f"Your response answer must be in {lang} language."
-        )
-        completion = gemini_flash_completion(prompt, model=self.model_name, temperature=0.7)
-        return completion.strip()
-# 🔹 Model Class (change to others if needed)
-chatbot = RAGMedicalChatbot(
-    model_name="gemini-2.0-flash",
-    retrieve_function=retrieve_medical_info
-)
-print("✅ Medical chatbot is ready! 🤖")
-# 🔹 FastAPI Server
-# from fastapi.staticfiles import StaticFiles
-from fastapi.middleware.cors import CORSMiddleware # Bypassing CORS origin
-app = FastAPI(title="Medical Chatbot")
-# 1. Define the origins
-origins = [
-    "http://localhost:5173",            # Vite dev server
-    "http://localhost:3000",            # Another vercel dev server
-    "https://medical-chatbot-henna.vercel.app", # ✅ Vercel frontend production URL
-]
-# 2. Then add the CORS middleware:
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=origins,   # or ["*"] to allow all
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-# 🔹 Chat route
 @app.post("/chat")
 async def chat_endpoint(data: dict):
     user_query = data.get("query", "")
-    lang = data.get("lang", "EN")  # Expect a language code from the request
     if not user_query:
         return JSONResponse(content={"response": "No query provided."})
     start_time = time.time()
-    response_text = chatbot.chat(user_query, lang)  # Pass language selection
     end_time = time.time()
     response_text += f"\n\n(Response time: {end_time - start_time:.2f} seconds)"
-    return JSONResponse(content={"response": response_text})
-# 🔹 Main Execution
-import uvicorn
 if __name__ == "__main__":
-    try:
-        print("✅ Server is starting...")
-        uvicorn.run(app, host="0.0.0.0", port=7860, workers=1) # Default 2 workers, cut to 1
-    except Exception as e:
-        print(f"❌ Server startup failed: {e}")
-        exit(1)

 import os
 import faiss
 import numpy as np
 import time
+import uvicorn
 from fastapi import FastAPI
 from fastapi.responses import JSONResponse
+from pymongo import MongoClient
+from google import genai
+from sentence_transformers import SentenceTransformer
+# ✅ Environment Variables
 mongo_uri = os.getenv("MONGO_URI")
 index_uri = os.getenv("INDEX_URI")
+gemini_flash_api_key = os.getenv("FlashAPI")
+if not all([gemini_flash_api_key, mongo_uri, index_uri]):
+    raise ValueError("❌ Missing API keys! Set them in Hugging Face Secrets.")
+# ✅ Reduce Memory Usage
 os.environ["OMP_NUM_THREADS"] = "1"
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
+# ✅ Initialize FastAPI app
+app = FastAPI(title="Medical Chatbot API")
+# ✅ Use Lazy Loading for FAISS Index
+index = None  # Delay FAISS Index loading until first query
+# ✅ Load SentenceTransformer Model (Quantized)
+print("📥 Loading SentenceTransformer Model...")
+MODEL_CACHE_DIR = "/app/model_cache"
+embedding_model = SentenceTransformer(MODEL_CACHE_DIR, device="cpu")
+embedding_model = embedding_model.half()  # Reduce memory usage
+# ✅ Setup MongoDB Connection
+client = MongoClient(mongo_uri)
+db = client["MedicalChatbotDB"]
+qa_collection = db["qa_data"]
+iclient = MongoClient(index_uri)
+idb = iclient["MedicalChatbotDB"]
+index_collection = idb["faiss_index_files"]
+# ✅ Load FAISS Index (Lazy Load)
 import gridfs
+fs = gridfs.GridFS(idb, collection="faiss_index_files")
 def load_faiss_index():
     global index
     if index is None:
+        print("⏳ Loading FAISS index from GridFS...")
+        existing_file = fs.find_one({"filename": "faiss_index.bin"})
+        if existing_file:
+            stored_index_bytes = existing_file.read()
+            index_bytes_np = np.frombuffer(stored_index_bytes, dtype='uint8')
+            index = faiss.deserialize_index(index_bytes_np)
+            print("✅ FAISS Index Loaded")
+        else:
+            print("❌ FAISS index not found in GridFS.")
+    return index
+# ✅ Retrieve Medical Info
+def retrieve_medical_info(query):
+    global index
+    index = load_faiss_index()  # Load FAISS on demand
     if index is None:
         return ["No medical information available."]
     query_embedding = embedding_model.encode([query], convert_to_numpy=True)
     _, idxs = index.search(query_embedding, k=3)
+    results = [qa_collection.find_one({"i": int(i)}).get("Doctor", "No answer available.") for i in idxs[0]]
     return results
+# ✅ Gemini Flash API Call
 def gemini_flash_completion(prompt, model, temperature=0.7):
     client_genai = genai.Client(api_key=gemini_flash_api_key)
     try:
         response = client_genai.models.generate_content(model=model, contents=prompt)
         return response.text
     except Exception as e:
+        print(f"❌ Error calling Gemini API: {e}")
         return "Error generating response from Gemini."
+# ✅ Chatbot Class
 class RAGMedicalChatbot:
     def __init__(self, model_name, retrieve_function):
         self.model_name = model_name
     def chat(self, user_query, lang="EN"):
         retrieved_info = self.retrieve(user_query)
         knowledge_base = "\n".join(retrieved_info)
+        # ✅ Construct Prompt
+        prompt = f"""
+        Please format your answer using markdown.
+        **Bold for titles**, *italic for emphasis*, and clear headings.
+        **Medical knowledge:**
+        {knowledge_base}
+        **Question:** {user_query}
+        **Language Required:** {lang}
+        """
+        completion = gemini_flash_completion(prompt, model=self.model_name, temperature=0.7)
+        return completion.strip()
+# ✅ Initialize Chatbot
+chatbot = RAGMedicalChatbot(model_name="gemini-2.0-flash", retrieve_function=retrieve_medical_info)
+# ✅ Chat Endpoint
 @app.post("/chat")
 async def chat_endpoint(data: dict):
     user_query = data.get("query", "")
+    lang = data.get("lang", "EN")
     if not user_query:
         return JSONResponse(content={"response": "No query provided."})
     start_time = time.time()
+    response_text = chatbot.chat(user_query, lang)
     end_time = time.time()
     response_text += f"\n\n(Response time: {end_time - start_time:.2f} seconds)"
+    return JSONResponse(content={"response": response_text})
+# ✅ Run Uvicorn with 1 Worker
 if __name__ == "__main__":
+    print("✅ Starting FastAPI Server...")
+    uvicorn.run(app, host="0.0.0.0", port=7860, workers=1)