Update app.py
app.py
CHANGED
@@ -313,23 +313,20 @@ model_path = "./hf_model"
 if not os.path.exists(model_path):
     model_path = "ByteDance/DOLPHIN"

-
-
-
-
-
-
-
-
-
-# Initialize embedding model for RAG
+# Model paths and configuration
+model_path = "./hf_model" if os.path.exists("./hf_model") else "ByteDance/DOLPHIN"
+hf_token = os.getenv('HF_TOKEN')
+
+# Don't load models initially - load them on demand
+model_status = "✅ Models ready (Dynamic loading)"
+
+# Initialize embedding model for RAG (CPU to save GPU memory)
 if RAG_DEPENDENCIES_AVAILABLE:
     try:
         print("Loading embedding model for RAG...")
-        # Use
-
-
-        print(f"✅ Embedding model loaded successfully ({device})")
+        # Use CPU for embedding model to save GPU memory for main models
+        embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
+        print("✅ Embedding model loaded successfully (CPU)")
     except Exception as e:
         print(f"❌ Error loading embedding model: {e}")
         import traceback
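Note on the hunk above: the SentenceTransformer embedder is now pinned to the CPU so GPU memory stays free for the DOLPHIN and Gemma models. A minimal sketch of how such a CPU-pinned encoder behaves (names below are illustrative, not from app.py):

from sentence_transformers import SentenceTransformer

# Runs entirely on the CPU; the GPU stays free for the large models.
embedder = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")
vectors = embedder.encode(["What is the main contribution?"], show_progress_bar=False)
print(vectors.shape)  # (1, 384) for this model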
@@ -339,39 +336,94 @@ else:
     print("❌ RAG dependencies not available")
     embedding_model = None

-#
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+# Model management functions
+def load_dolphin_model():
+    """Load DOLPHIN model for PDF processing"""
+    global dolphin_model, current_model
+
+    if current_model == "dolphin":
+        return dolphin_model
+
+    # Unload chatbot model if loaded
+    unload_chatbot_model()
+
+    try:
+        print("Loading DOLPHIN model...")
+        dolphin_model = DOLPHIN(model_path)
+        current_model = "dolphin"
+        print(f"✅ DOLPHIN model loaded (Device: {dolphin_model.device})")
+        return dolphin_model
+    except Exception as e:
+        print(f"❌ Error loading DOLPHIN model: {e}")
+        return None
+
+def unload_dolphin_model():
+    """Unload DOLPHIN model to free memory"""
+    global dolphin_model, current_model
+
+    if dolphin_model is not None:
+        print("Unloading DOLPHIN model...")
+        del dolphin_model
+        dolphin_model = None
+        if current_model == "dolphin":
+            current_model = None
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        print("✅ DOLPHIN model unloaded")
+
+def load_chatbot_model():
+    """Load Gemma chatbot model"""
+    global chatbot_model, chatbot_processor, current_model
+
+    if current_model == "chatbot":
+        return chatbot_model, chatbot_processor
+
+    # Unload DOLPHIN model if loaded
+    unload_dolphin_model()
+
+    try:
+        print("Loading Gemma chatbot model...")
+        print(f"HF_TOKEN found: {'Yes' if hf_token else 'No'}")
+
+        if hf_token:
+            chatbot_model = Gemma3nForConditionalGeneration.from_pretrained(
+                "google/gemma-3n-e4b-it",
+                device_map="auto",
+                torch_dtype=torch.bfloat16,
+                token=hf_token
+            ).eval()
+
+            chatbot_processor = AutoProcessor.from_pretrained(
+                "google/gemma-3n-e4b-it",
+                token=hf_token
+            )
+
+            current_model = "chatbot"
+            print("✅ Gemma chatbot model loaded")
+            return chatbot_model, chatbot_processor
+        else:
+            print("❌ No HF_TOKEN found")
+            return None, None
+    except Exception as e:
+        print(f"❌ Error loading chatbot model: {e}")
+        import traceback
+        traceback.print_exc()
+        return None, None
+
+def unload_chatbot_model():
+    """Unload chatbot model to free memory"""
+    global chatbot_model, chatbot_processor, current_model
+
+    if chatbot_model is not None:
+        print("Unloading Gemma chatbot model...")
+        del chatbot_model, chatbot_processor
         chatbot_model = None
         chatbot_processor = None
-
-
-
-
-
-    chatbot_model = None
-    chatbot_processor = None
+        if current_model == "chatbot":
+            current_model = None
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        print("✅ Gemma chatbot model unloaded")


 # Global state for managing tabs
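Note: the four functions added above keep at most one large model resident at a time, deleting the old one and emptying the CUDA cache before the next load. A compact sketch of that swap pattern with hypothetical names (swap_in, _resident), for reference only:

import gc
import torch

_resident = {"name": None, "model": None}

def swap_in(name, loader):
    """Load `name` via loader() after evicting whatever is currently resident."""
    if _resident["name"] == name:
        return _resident["model"]
    if _resident["model"] is not None:
        _resident["model"] = None          # drop the last reference
        gc.collect()                       # let Python release the weights
        if torch.cuda.is_available():
            torch.cuda.empty_cache()       # hand freed blocks back to the driver
    _resident["name"], _resident["model"] = name, loader()
    return _resident["model"]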
@@ -380,10 +432,15 @@ show_results_tab = False
 document_chunks = []
 document_embeddings = None
 embedding_model = None
-
+
+# Global model state - only one model loaded at a time
+dolphin_model = None
+chatbot_model = None
+chatbot_processor = None
+current_model = None  # Track which model is currently loaded


-def chunk_document(text, chunk_size=
+def chunk_document(text, chunk_size=400, overlap=40):
     """Split document into overlapping chunks for RAG"""
     words = text.split()
     chunks = []
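Note: only the new signature of chunk_document (chunk_size=400, overlap=40) and its first lines appear in this hunk. Below is a minimal sketch of a word-window chunker consistent with those parameters; the real body is outside the diff, so the loop is an assumption:

def chunk_document_sketch(text, chunk_size=400, overlap=40):
    """Split text into word windows of chunk_size words, with `overlap` words shared between neighbours."""
    words = text.split()
    chunks = []
    step = max(chunk_size - overlap, 1)
    for start in range(0, len(words), step):
        chunk = " ".join(words[start:start + chunk_size])
        if chunk:
            chunks.append(chunk)
    return chunks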
@@ -401,8 +458,8 @@ def create_embeddings(chunks):
         return None

     try:
-        # Process in
-        batch_size =
+        # Process in smaller batches on CPU
+        batch_size = 32
         embeddings = []

         for i in range(0, len(chunks), batch_size):
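Note: the hunk above sets batch_size = 32 but the body of the batching loop is not shown. A sketch of what such a loop typically looks like with a SentenceTransformer encoder; np.vstack and the encode() call are assumptions, not lines from app.py:

import numpy as np

def create_embeddings_sketch(chunks, embedder, batch_size=32):
    embeddings = []
    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i + batch_size]
        # encode() returns a (len(batch), dim) numpy array by default
        embeddings.append(embedder.encode(batch, show_progress_bar=False))
    return np.vstack(embeddings) if embeddings else None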
@@ -415,10 +472,10 @@ def create_embeddings(chunks):
         print(f"Error creating embeddings: {e}")
         return None

-def retrieve_relevant_chunks(question, chunks, embeddings, top_k=3):
+def retrieve_relevant_chunks(question, chunks, embeddings, top_k=2):
     """Retrieve most relevant chunks for a question"""
     if embedding_model is None or embeddings is None:
-        return chunks[:
+        return chunks[:2]  # Fallback to first 2 chunks

     try:
         question_embedding = embedding_model.encode([question], show_progress_bar=False)
@@ -431,32 +488,43 @@ def retrieve_relevant_chunks(question, chunks, embeddings, top_k=3):
         return relevant_chunks
     except Exception as e:
         print(f"Error retrieving chunks: {e}")
-        return chunks[:
+        return chunks[:2]  # Fallback

 def process_uploaded_pdf(pdf_file, progress=gr.Progress()):
     """Main processing function for uploaded PDF"""
     global processed_markdown, show_results_tab, document_chunks, document_embeddings

-    if dolphin_model is None:
-        return "❌ Model not loaded", gr.Tabs(visible=False)
-
     if pdf_file is None:
         return "❌ No PDF uploaded", gr.Tabs(visible=False)

     try:
-
+        # Load DOLPHIN model for PDF processing
+        progress(0.1, desc="Loading DOLPHIN model...")
+        dolphin = load_dolphin_model()
+
+        if dolphin is None:
+            return "❌ Failed to load DOLPHIN model", gr.Tabs(visible=False)
+
+        # Process PDF
+        progress(0.2, desc="Processing PDF...")
+        combined_markdown, status = process_pdf_document(pdf_file, dolphin, progress)

         if status == "processing_complete":
             processed_markdown = combined_markdown

             # Create chunks and embeddings for RAG
-
+            progress(0.9, desc="Creating document chunks for RAG...")
             document_chunks = chunk_document(processed_markdown)
             document_embeddings = create_embeddings(document_chunks)
             print(f"Created {len(document_chunks)} chunks")

+            # Unload DOLPHIN model to free memory for chatbot
+            progress(0.95, desc="Preparing chatbot...")
+            unload_dolphin_model()
+
             show_results_tab = True
-
+            progress(1.0, desc="PDF processed successfully!")
+            return "✅ PDF processed successfully! Chatbot is ready in the Chat tab.", gr.Tabs(visible=True)
         else:
             show_results_tab = False
             return combined_markdown, gr.Tabs(visible=False)
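Note: retrieve_relevant_chunks now returns the top 2 chunks, but the ranking step between encoding the question and returning relevant_chunks is not visible in the diff. A hedged sketch of a cosine-similarity ranking that fits the surrounding code; the scikit-learn import is an assumption:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def rank_chunks_sketch(question_embedding, embeddings, chunks, top_k=2):
    scores = cosine_similarity(question_embedding, embeddings)[0]
    top_idx = np.argsort(scores)[::-1][:top_k]   # highest similarity first
    return [chunks[i] for i in top_idx]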
@@ -481,7 +549,11 @@ def clear_all():
     document_chunks = []
     document_embeddings = None

-
+    # Unload any loaded models
+    unload_dolphin_model()
+    unload_chatbot_model()
+
+    return None, "✅ Ready to process your PDF", gr.Tabs(visible=False)


 # Create Gradio interface
@@ -535,14 +607,14 @@ with gr.Blocks(
     with gr.Tabs() as main_tabs:
         # Home Tab
         with gr.TabItem("π Home", id="home"):
-            chatbot_status = "✅ Chatbot ready" if chatbot_model else "❌ Chatbot not loaded"
             embedding_status = "✅ RAG ready" if embedding_model else "❌ RAG not loaded"
+            current_status = f"Currently loaded: {current_model or 'None'}"
             gr.Markdown(
                 "# Scholar Express\n"
-                "### Upload a research paper to get a web-friendly version, an AI chatbot, and a podcast summary.
-                f"**
-                f"**
-                f"**
+                "### Upload a research paper to get a web-friendly version, an AI chatbot, and a podcast summary. Models are loaded dynamically to optimize memory usage.\n"
+                f"**System:** {model_status}\n"
+                f"**RAG System:** {embedding_status}\n"
+                f"**Status:** {current_status}"
             )

             with gr.Column(elem_classes="upload-container"):
@@ -647,56 +719,60 @@
         if not message.strip():
             return history

-        if chatbot_model is None:
-            return history + [[message, "❌ Chatbot model not loaded. Please check your HuggingFace token."]]
-
         if not processed_markdown:
             return history + [[message, "❌ Please process a PDF document first before asking questions."]]

         try:
+            # Load chatbot model
+            model, processor = load_chatbot_model()
+
+            if model is None or processor is None:
+                return history + [[message, "❌ Failed to load chatbot model. Please check your HuggingFace token."]]
+
             # Use RAG to get relevant chunks instead of full document
             if document_chunks and len(document_chunks) > 0:
                 relevant_chunks = retrieve_relevant_chunks(message, document_chunks, document_embeddings)
                 context = "\n\n".join(relevant_chunks)
             else:
                 # Fallback to truncated document if RAG fails
-                context = processed_markdown[:
+                context = processed_markdown[:1200] + "..." if len(processed_markdown) > 1200 else processed_markdown

-            # Create chat messages
+            # Create chat messages with shorter context
            messages = [
                 {
                     "role": "system",
-                    "content": [{"type": "text", "text": "You are a helpful assistant
+                    "content": [{"type": "text", "text": "You are a helpful assistant. Answer questions about the document concisely."}]
                 },
                 {
                     "role": "user",
-                    "content": [{"type": "text", "text": f"
+                    "content": [{"type": "text", "text": f"Context:\n{context}\n\nQ: {message}"}]
                 }
             ]

             # Process with the model
-            inputs =
+            inputs = processor.apply_chat_template(
                 messages,
                 add_generation_prompt=True,
                 tokenize=True,
                 return_dict=True,
                 return_tensors="pt",
-            ).to(
+            ).to(model.device)

             input_len = inputs["input_ids"].shape[-1]

             with torch.inference_mode():
-                generation =
+                generation = model.generate(
                     **inputs,
-                    max_new_tokens=
+                    max_new_tokens=300,  # Can be higher now with single model
                     do_sample=False,
                     temperature=0.7,
-                    pad_token_id=
-                    use_cache=True #
+                    pad_token_id=processor.tokenizer.pad_token_id,
+                    use_cache=True,  # Can enable cache with single model
+                    num_beams=1
                 )
                 generation = generation[0][input_len:]

-            response =
+            response = processor.decode(generation, skip_special_tokens=True)

             return history + [[message, response]]

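Note on the generation call above: with do_sample=False the temperature=0.7 argument has no effect on greedy decoding (recent transformers versions warn about this combination). A hedged sketch of the two internally consistent configurations, wrapped as a standalone helper rather than the app's actual code:

import torch

def generate_reply(model, processor, inputs, max_new_tokens=300, sample=False):
    kwargs = dict(
        max_new_tokens=max_new_tokens,
        pad_token_id=processor.tokenizer.pad_token_id,
        use_cache=True,
    )
    if sample:
        kwargs.update(do_sample=True, temperature=0.7)   # temperature applies only when sampling
    else:
        kwargs.update(do_sample=False)                   # greedy decoding, no temperature needed
    with torch.inference_mode():
        return model.generate(**inputs, **kwargs)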