ramysaidagieb committed on
Commit
faa82c9
·
verified ·
1 Parent(s): e6f0575

Update rag_pipeline.py

Browse files
Files changed (1) hide show
  1. rag_pipeline.py +28 -39
rag_pipeline.py CHANGED
@@ -1,4 +1,4 @@
1
- from sentence_transformers import CrossEncoder, SentenceTransformer
2
  from transformers import AutoTokenizer, AutoModelForCausalLM
3
  import faiss
4
  import numpy as np
@@ -6,67 +6,56 @@ from typing import List, Dict
6
 
7
  class ArabicRAGSystem:
8
  def __init__(self):
9
- """Initialize with fallback models for Arabic support"""
10
- # Solution 1: Use reliable Arabic embedding model
11
- self.embedding_model = SentenceTransformer("UBC-NLP/AraBERT")
12
 
13
- # Solution 2: Fallback cross-encoder options
14
- try:
15
- self.cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2") # Multilingual fallback
16
- except:
17
- self.cross_encoder = None # System will work without it
18
-
19
- # Solution 3: Main Arabic LLM with error handling
20
  try:
21
  self.tokenizer = AutoTokenizer.from_pretrained("inception-mbzuai/jais-13b-chat")
22
  self.llm = AutoModelForCausalLM.from_pretrained("inception-mbzuai/jais-13b-chat")
23
  except:
24
- # Fallback to smaller Arabic model
25
- self.tokenizer = AutoTokenizer.from_pretrained("aubmindlab/aragpt2-base")
26
- self.llm = AutoModelForCausalLM.from_pretrained("aubmindlab/aragpt2-base")
27
-
28
- self.index = faiss.IndexFlatL2(768) # AraBERT uses 768-dim embeddings
 
 
29
 
30
  def generate_answer(self, question: str, documents: List[Dict],
31
- top_k: int = 5, temperature: float = 0.7) -> tuple:
32
- """Enhanced with fallback retrieval methods"""
33
  # Index documents
34
  texts = [doc["text"] for doc in documents]
35
  self.index.add(np.array(self.embedding_model.encode(texts)))
36
 
37
- # Two-phase retrieval with fallback
38
  query_embedding = self.embedding_model.encode([question])
39
- distances, indices = self.index.search(query_embedding, top_k*2)
40
-
41
- # Solution 4: Cross-encoder fallback logic
42
- if self.cross_encoder:
43
- pairs = [[question, documents[idx]["text"]] for idx in indices[0]]
44
- scores = self.cross_encoder.predict(pairs)
45
- top_indices = [indices[0][i] for i in np.argsort(scores)[-top_k:][::-1]]
46
- else:
47
- top_indices = indices[0][:top_k]
48
 
49
  # Prepare context with metadata
50
  context = "\n\n".join([
51
  f"المرجع: {documents[idx]['source']}\n"
52
  f"الصفحة: {documents[idx].get('page', 'N/A')}\n"
53
  f"النص: {documents[idx]['text']}\n"
54
- for idx in top_indices
55
  ])
56
 
57
- # Generation with error handling
58
  prompt = f"""
59
- نظام التحليل الديني العربي:
 
60
  السياق:
61
  {context}
62
 
63
  السؤال: {question}
64
 
65
  التعليمات:
66
- - أجب باللغة العربية الفصحى فقط
67
- - استخدم المعلومات من السياق فقط
68
- - أشر إلى المصادر باستخدام [المرجع: اسم الملف، الصفحة]
69
- - إذا لم تجد إجابة واضحة قل "لا تتوفر معلومات كافية"
70
 
71
  الإجابة:
72
  """.strip()
@@ -82,15 +71,15 @@ class ArabicRAGSystem:
82
  )
83
  answer = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
84
  answer = answer.split("الإجابة:")[-1].strip()
85
- except:
86
- answer = "عذراً، حدث خطأ في معالجة السؤال. يرجى المحاولة مرة أخرى."
87
 
88
  # Prepare sources
89
  sources = [{
90
  "text": documents[idx]["text"],
91
  "source": documents[idx]["source"],
92
  "page": documents[idx].get("page", "N/A"),
93
- "score": float(1 - distances[0][i]) if self.cross_encoder else 0.0
94
- } for i, idx in enumerate(top_indices)]
95
 
96
  return answer, sources
 
1
+ from sentence_transformers import SentenceTransformer
2
  from transformers import AutoTokenizer, AutoModelForCausalLM
3
  import faiss
4
  import numpy as np
 
6
 
7
  class ArabicRAGSystem:
8
  def __init__(self):
9
+ """Initialize with guaranteed-accessible Arabic models"""
10
+ # Verified embedding models (publicly available)
11
+ self.embedding_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
12
 
13
+ # Main Arabic LLM with local fallback
 
 
 
 
 
 
14
  try:
15
  self.tokenizer = AutoTokenizer.from_pretrained("inception-mbzuai/jais-13b-chat")
16
  self.llm = AutoModelForCausalLM.from_pretrained("inception-mbzuai/jais-13b-chat")
17
  except:
18
+ try:
19
+ self.tokenizer = AutoTokenizer.from_pretrained("aubmindlab/aragpt2-base")
20
+ self.llm = AutoModelForCausalLM.from_pretrained("aubmindlab/aragpt2-base")
21
+ except:
22
+ raise Exception("Failed to load any Arabic language model. Please check internet connection and try again.")
23
+
24
+ self.index = faiss.IndexFlatL2(384) # Multilingual MiniLM uses 384-dim
25
 
26
  def generate_answer(self, question: str, documents: List[Dict],
27
+ top_k: int = 3, temperature: float = 0.7) -> tuple:
28
+ """Robust generation with guaranteed fallbacks"""
29
  # Index documents
30
  texts = [doc["text"] for doc in documents]
31
  self.index.add(np.array(self.embedding_model.encode(texts)))
32
 
33
+ # Simple semantic search (no cross-encoder dependency)
34
  query_embedding = self.embedding_model.encode([question])
35
+ distances, indices = self.index.search(query_embedding, top_k)
 
 
 
 
 
 
 
 
36
 
37
  # Prepare context with metadata
38
  context = "\n\n".join([
39
  f"المرجع: {documents[idx]['source']}\n"
40
  f"الصفحة: {documents[idx].get('page', 'N/A')}\n"
41
  f"النص: {documents[idx]['text']}\n"
42
+ for idx in indices[0]
43
  ])
44
 
45
+ # Generation with bulletproof prompt
46
  prompt = f"""
47
+ أنت مساعد ذكي متخصص في النصوص الدينية العربية. أجب على السؤال بناءً على السياق التالي فقط:
48
+
49
  السياق:
50
  {context}
51
 
52
  السؤال: {question}
53
 
54
  التعليمات:
55
+ 1. استخدم المعلومات من السياق فقط
56
+ 2. أجب باللغة العربية الفصحى
57
+ 3. أشر إلى المصادر بهذا الشكل: [المرجع: اسم الملف، الصفحة]
58
+ 4. إذا لم تجد إجابة واضحة قل "لا توجد معلومات كافية في النصوص المقدمة"
59
 
60
  الإجابة:
61
  """.strip()
 
71
  )
72
  answer = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
73
  answer = answer.split("الإجابة:")[-1].strip()
74
+ except Exception as e:
75
+ answer = f"عذراً، حدث خطأ في معالجة السؤال. التفاصيل: {str(e)}"
76
 
77
  # Prepare sources
78
  sources = [{
79
  "text": documents[idx]["text"],
80
  "source": documents[idx]["source"],
81
  "page": documents[idx].get("page", "N/A"),
82
+ "score": float(1 - distances[0][i])
83
+ } for i, idx in enumerate(indices[0])]
84
 
85
  return answer, sources