Update app.py
app.py CHANGED
@@ -27,56 +27,62 @@ class DocumentRAG:
         self.is_indexed = False
 
     def setup_llm(self):
+        """Setup quantized Mistral model"""
+        try:
+            # Check if CUDA is available
+            if not torch.cuda.is_available():
+                print("⚠️ CUDA not available, falling back to CPU or alternative model")
+                self.setup_fallback_model()
+                return
+
             quantization_config = BitsAndBytesConfig(
                 load_in_4bit=True,
                 bnb_4bit_compute_dtype=torch.float16,
                 bnb_4bit_use_double_quant=True,
                 bnb_4bit_quant_type="nf4"
             )
 
             model_name = "mistralai/Mistral-7B-Instruct-v0.1"
 
             # Load tokenizer first
             self.tokenizer = AutoTokenizer.from_pretrained(
                 model_name,
                 trust_remote_code=True
             )
 
             # Fix padding token issue
             if self.tokenizer.pad_token is None:
                 self.tokenizer.pad_token = self.tokenizer.eos_token
 
             # Load model with quantization
             self.model = AutoModelForCausalLM.from_pretrained(
                 model_name,
                 quantization_config=quantization_config,
                 device_map="auto",
                 torch_dtype=torch.float16,
                 trust_remote_code=True,
-                low_cpu_mem_usage=True  # Added for better memory management
+                low_cpu_mem_usage=True
             )
 
             print("✅ Quantized Mistral model loaded successfully")
 
         except Exception as e:
             print(f"❌ Error loading model: {e}")
             print("🔄 Falling back to alternative model...")
             self.setup_fallback_model()
 
     def setup_fallback_model(self):
         """Fallback to smaller model if Mistral fails"""
         try:
+            # Use a better fallback model for Q&A
+            model_name = "distilgpt2"
             self.tokenizer = AutoTokenizer.from_pretrained(model_name)
             self.model = AutoModelForCausalLM.from_pretrained(model_name)
+
+            # Fix padding token for fallback model too
+            if self.tokenizer.pad_token is None:
+                self.tokenizer.pad_token = self.tokenizer.eos_token
+
             print("✅ Fallback model loaded")
         except Exception as e:
             print(f"❌ Fallback model failed: {e}")
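
The hunk above is the heart of the change: Mistral-7B-Instruct is loaded in 4-bit NF4 only when CUDA is available. The loading path can be exercised outside the Space with a minimal standalone sketch; it assumes a CUDA GPU and that bitsandbytes and accelerate are installed, and reuses only names that appear in the diff.

# Minimal standalone sketch of the 4-bit loading path added above.
# Assumes a CUDA GPU and that bitsandbytes/accelerate are installed.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "mistralai/Mistral-7B-Instruct-v0.1"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # store weights as 4-bit NF4
    bnb_4bit_compute_dtype=torch.float16,  # compute in fp16
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # same pad-token fix as the diff

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",  # let accelerate place the layers
)
print("model loaded:", model.config._name_or_path)
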
@@ -135,21 +141,35 @@ class DocumentRAG:
         except Exception as e2:
             return f"Error reading TXT: {str(e2)}"
 
-    def chunk_text(self, text: str, chunk_size: int =
-        """Split text into overlapping chunks"""
+    def chunk_text(self, text: str, chunk_size: int = 300, overlap: int = 50) -> List[str]:
+        """Split text into overlapping chunks with better sentence preservation"""
         if not text.strip():
             return []
 
+        # Split by sentences first, then group into chunks
+        sentences = text.replace('\n', ' ').split('. ')
         chunks = []
+        current_chunk = ""
 
+        for sentence in sentences:
+            sentence = sentence.strip()
+            if not sentence:
+                continue
+
+            # Add sentence to current chunk
+            test_chunk = current_chunk + ". " + sentence if current_chunk else sentence
 
+            # If chunk gets too long, save it and start new one
+            if len(test_chunk.split()) > chunk_size:
+                if current_chunk:
+                    chunks.append(current_chunk.strip())
+                current_chunk = sentence
+            else:
+                current_chunk = test_chunk
+
+        # Add the last chunk
+        if current_chunk:
+            chunks.append(current_chunk.strip())
 
         return chunks
 
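
The new chunk_text groups sentences by word count instead of slicing raw characters (note that the overlap argument is accepted but not used in this version). Below is a standalone copy of the same logic, handy for quick testing; the sample text and the small chunk_size are made up for illustration.

# Standalone copy of the sentence-based chunking added above, for quick testing.
from typing import List

def chunk_text(text: str, chunk_size: int = 300, overlap: int = 50) -> List[str]:
    if not text.strip():
        return []
    sentences = text.replace('\n', ' ').split('. ')
    chunks, current_chunk = [], ""
    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue
        test_chunk = current_chunk + ". " + sentence if current_chunk else sentence
        if len(test_chunk.split()) > chunk_size:
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = sentence
        else:
            current_chunk = test_chunk
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks

# Example: with chunk_size=8 (words) the two sentences land in separate chunks.
sample = "FAISS indexes dense vectors for fast similarity search. Sentence grouping keeps each chunk readable."
print(chunk_text(sample, chunk_size=8))
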
@@ -205,7 +225,7 @@ class DocumentRAG:
         except Exception as e:
             return f"❌ Error processing documents: {str(e)}"
 
-    def retrieve_context(self, query: str, k: int =
+    def retrieve_context(self, query: str, k: int = 5) -> str:
         """Retrieve relevant context for the query"""
         if not self.is_indexed:
             return ""
@@ -218,10 +238,10 @@ class DocumentRAG:
         # Search for similar chunks
         scores, indices = self.index.search(query_embedding.astype('float32'), k)
 
-        # Get relevant documents
+        # Get relevant documents with higher threshold
         relevant_docs = []
         for i, idx in enumerate(indices[0]):
-            if idx < len(self.documents) and scores[0][i] > 0.
+            if idx < len(self.documents) and scores[0][i] > 0.2:  # Higher similarity threshold
                 relevant_docs.append(self.documents[idx])
 
         return "\n\n".join(relevant_docs)
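
retrieve_context relies on an index and query embedding built elsewhere in app.py, which this diff does not show. Below is a hedged sketch of the usual pattern behind that call: the all-MiniLM-L6-v2 embedder, normalized vectors, and IndexFlatIP are assumptions; only the search call and the 0.2 score filter come from the diff.

# Hedged sketch of the index/query pattern retrieve_context relies on.
# The embedding model and cosine-via-normalized-vectors setup are assumptions,
# since the indexing code is outside this diff.
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

embedder = SentenceTransformer("all-MiniLM-L6-v2")  # assumed embedding model
documents = ["Chunk one about revenue.", "Chunk two about staffing."]

embeddings = embedder.encode(documents, normalize_embeddings=True)
index = faiss.IndexFlatIP(embeddings.shape[1])  # inner product == cosine on unit vectors
index.add(np.asarray(embeddings, dtype="float32"))

query_embedding = embedder.encode(["What was the revenue?"], normalize_embeddings=True)
scores, indices = index.search(np.asarray(query_embedding, dtype="float32"), 2)

# Same filtering rule as the hunk above: keep hits scoring above 0.2.
relevant = [documents[i] for s, i in zip(scores[0], indices[0])
            if i < len(documents) and s > 0.2]
print(relevant)
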
@@ -231,52 +251,73 @@ class DocumentRAG:
             return ""
 
     def generate_answer(self, query: str, context: str) -> str:
-        """Generate answer using the LLM"""
+        """Generate answer using the LLM with improved prompting"""
         if self.model is None or self.tokenizer is None:
             return "❌ Model not available. Please try again."
 
         try:
+            # Check if using Mistral (has specific prompt format) or fallback model
+            model_name = getattr(self.model.config, '_name_or_path', '').lower()
+            is_mistral = 'mistral' in model_name
+
+            if is_mistral:
+                # Mistral-specific prompt format
+                prompt = f"""<s>[INST] You are a helpful assistant that answers questions based on the provided context. Use only the information from the context to answer. If the information is not in the context, say "I don't have enough information to answer this question."
 
 Context:
-{context[:
+{context[:1500]}
+
+Question: {query}
+
+Provide a clear and concise answer based only on the context above. [/INST]"""
+            else:
+                # Generic prompt for fallback models
+                prompt = f"""Context: {context[:1000]}
 
 Question: {query}
 
-Answer:
+Answer based on the context:"""
 
-            # Tokenize
+            # Tokenize with proper handling
             inputs = self.tokenizer(
                 prompt,
                 return_tensors="pt",
-                max_length=
+                max_length=800,  # Reduced to fit in memory
                 truncation=True,
                 padding=True
             )
 
+            # Move to same device as model
+            if torch.cuda.is_available() and next(self.model.parameters()).is_cuda:
+                inputs = {k: v.cuda() for k, v in inputs.items()}
+
+            # Generate with better parameters
             with torch.no_grad():
                 outputs = self.model.generate(
                     **inputs,
-                    max_new_tokens=
-                    temperature=0.
+                    max_new_tokens=150,  # Reduced for more focused answers
+                    temperature=0.3,  # Lower temperature for more consistent answers
                     do_sample=True,
-                    top_p=0.
+                    top_p=0.8,
+                    repetition_penalty=1.1,
+                    pad_token_id=self.tokenizer.pad_token_id,
                     eos_token_id=self.tokenizer.eos_token_id
                 )
 
             # Decode response
             full_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
 
-            # Extract answer
-            if "[/INST]" in full_response:
+            # Extract answer based on model type
+            if is_mistral and "[/INST]" in full_response:
                 answer = full_response.split("[/INST]")[-1].strip()
             else:
+                # For other models, remove the prompt
                 answer = full_response[len(prompt):].strip()
 
+            # Clean up the answer
+            answer = answer.replace(prompt, "").strip()
+
+            return answer if answer else "I couldn't generate a proper response based on the context."
 
         except Exception as e:
             return f"❌ Error generating answer: {str(e)}"
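
The fallback branch of generate_answer can be checked on CPU with distilgpt2, the fallback model this diff selects. This is a rough sketch only: the sample context and question are invented, and distilgpt2 answers will be poor; the point is the tokenize / generate / prompt-stripping sequence.

# Quick CPU check of the fallback generation path, using distilgpt2 (the
# fallback model from this diff). Sample context and question are made up.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
model = AutoModelForCausalLM.from_pretrained("distilgpt2")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

context = "The report states that revenue grew 12% in 2023."
query = "How much did revenue grow?"
prompt = f"Context: {context}\n\nQuestion: {query}\n\nAnswer based on the context:"

inputs = tokenizer(prompt, return_tensors="pt", max_length=800, truncation=True, padding=True)
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=50,
        temperature=0.3,
        do_sample=True,
        top_p=0.8,
        repetition_penalty=1.1,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
answer = full_response[len(prompt):].strip()  # same prompt-stripping as the non-Mistral branch
print(answer)
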
@@ -294,12 +335,16 @@ Answer: [/INST]"""
             context = self.retrieve_context(query)
 
             if not context:
-                return "🔍 No relevant information found in the uploaded documents."
+                return "🔍 No relevant information found in the uploaded documents for your question."
 
             # Generate answer
             answer = self.generate_answer(query, context)
 
+            # Format the response
+            if answer and not answer.startswith("❌"):
+                return f"💡 **Answer:** {answer}\n\n📖 **Relevant Context:**\n{context[:400]}..."
+            else:
+                return answer
 
         except Exception as e:
             return f"❌ Error answering question: {str(e)}"
@@ -355,7 +400,7 @@ def create_interface():
         with gr.Column():
             answer_output = gr.Textbox(
                 label="Answer",
-                lines=
+                lines=12,
                 interactive=False
             )
 
@@ -372,6 +417,7 @@ def create_interface():
     - Can you summarize the key points?
     - What are the conclusions mentioned?
     - Are there any specific numbers or statistics?
+    - Who are the main people or organizations mentioned?
     """)
 
     return demo