ramysaidagieb committed
Commit b82ee60 · verified · 1 Parent(s): c90b40e

Update rag_pipeline.py

Files changed (1): rag_pipeline.py (+63, -51)
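The hunk below begins at line 6 of rag_pipeline.py, so the file's import header falls outside the diff. As a rough sketch (the committed file's actual imports are not shown, so this reconstruction is an assumption), the names used in both versions would resolve with:

    # Assumed import header -- reconstructed from the names used in the diff, not part of the commit
    import faiss                                                      # IndexFlatL2 vector index
    import numpy as np                                                # np.array / np.argsort
    from sentence_transformers import SentenceTransformer, CrossEncoder
    from transformers import AutoTokenizer, AutoModelForCausalLM
    from typing import List, Dict                                     # visible as hunk context at line 6

In the listing below, the original implementation (removed lines, marked -) is shown first, followed by the version added by this commit (marked +).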
rag_pipeline.py CHANGED
@@ -6,79 +6,91 @@ from typing import List, Dict
 class ArabicRAGSystem:
     def __init__(self):
-        # Initialize models
-        self.embedding_model = SentenceTransformer("aubmindlab/bert-base-arabertv2")
-        self.cross_encoder = CrossEncoder("Arabic-Misc/roberta-base-arabic-camelbert-da-msa")
-        self.tokenizer = AutoTokenizer.from_pretrained("inception-mbzuai/jais-13b-chat")
-        self.llm = AutoModelForCausalLM.from_pretrained("inception-mbzuai/jais-13b-chat")
-        self.index = faiss.IndexFlatL2(768)
-
-    def _create_index(self, documents: List[Dict]):
-        texts = [doc["text"] for doc in documents]
-        embeddings = self.embedding_model.encode(texts)
-        self.index.add(np.array(embeddings))

     def generate_answer(self, question: str, documents: List[Dict],
                         top_k: int = 5, temperature: float = 0.7) -> tuple:
-        # Indexing phase
-        self._create_index(documents)

-        # Two-stage retrieval
         query_embedding = self.embedding_model.encode([question])
         distances, indices = self.index.search(query_embedding, top_k*2)

-        # Re-ranking with cross-encoder
-        pairs = [[question, documents[idx]["text"]] for idx in indices[0]]
-        scores = self.cross_encoder.predict(pairs)
-        ranked_indices = np.argsort(scores)[::-1][:top_k]

-        # Prepare context
         context = "\n\n".join([
-            f"المصدر: {documents[idx]['source']}\n"
-            f"الصفحة: {documents[idx]['page']}\n"
-            f"النص: {documents[idx]['text']}"
-            for idx in [indices[0][i] for i in ranked_indices]
         ])

-        # Generate answer
         prompt = f"""
-        أنت خبير في التحليل الديني. قم بالإجابة على السؤال التالي بناءً على السياق المقدم فقط:
-
         السياق:
         {context}

-        السؤال:
-        {question}

         التعليمات:
-        - أجب باللغة العربية الفصحى
-        - استخدم علامات التنسيق المناسبة
-        - أشر إلى المصادر باستخدام التنسيق [المصدر: اسم الملف، الصفحة: رقم]
-        - إذا لم توجد إجابة واضحة، قل "لا تتوفر معلومات كافية"

         الإجابة:
         """.strip()

-        inputs = self.tokenizer(prompt, return_tensors="pt")
-        outputs = self.llm.generate(
-            inputs.input_ids,
-            max_new_tokens=512,
-            temperature=temperature,
-            do_sample=True,
-            pad_token_id=self.tokenizer.eos_token_id
-        )
-
-        answer = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-        answer = answer.split("الإجابة:")[-1].strip()

         # Prepare sources
-        sources = []
-        for idx in [indices[0][i] for i in ranked_indices]:
-            sources.append({
-                "text": documents[idx]["text"],
-                "source": documents[idx]["source"],
-                "page": documents[idx]["page"],
-                "score": float(scores[idx])
-            })

         return answer, sources
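One bug in the removed version is worth flagging before the rewrite: scores is indexed by position in pairs, yet the sources loop reads scores[idx] with a document index, so the reported score is wrong (or out of range) whenever retrieval order differs from document order. A minimal correction, keeping the original variable names, would be:

    # Hypothetical fix for the removed sources loop: look up scores by pair position
    sources = []
    for i in ranked_indices:              # i is a position into pairs/scores
        idx = indices[0][i]               # idx is a position into documents
        sources.append({
            "text": documents[idx]["text"],
            "source": documents[idx]["source"],
            "page": documents[idx]["page"],
            "score": float(scores[i]),    # pair-position score, not scores[idx]
        })

The rewrite below sidesteps this by computing top_indices directly, though its score field has a quirk of its own (see the note after the diff).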
 
 class ArabicRAGSystem:
     def __init__(self):
+        """Initialize with fallback models for Arabic support"""
+        # Solution 1: Use reliable Arabic embedding model
+        self.embedding_model = SentenceTransformer("UBC-NLP/AraBERT")
+
+        # Solution 2: Fallback cross-encoder options
+        try:
+            self.cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")  # Multilingual fallback
+        except:
+            self.cross_encoder = None  # System will work without it
+
+        # Solution 3: Main Arabic LLM with error handling
+        try:
+            self.tokenizer = AutoTokenizer.from_pretrained("inception-mbzuai/jais-13b-chat")
+            self.llm = AutoModelForCausalLM.from_pretrained("inception-mbzuai/jais-13b-chat")
+        except:
+            # Fallback to smaller Arabic model
+            self.tokenizer = AutoTokenizer.from_pretrained("aubmindlab/aragpt2-base")
+            self.llm = AutoModelForCausalLM.from_pretrained("aubmindlab/aragpt2-base")
+
+        self.index = faiss.IndexFlatL2(768)  # AraBERT uses 768-dim embeddings

     def generate_answer(self, question: str, documents: List[Dict],
                         top_k: int = 5, temperature: float = 0.7) -> tuple:
+        """Enhanced with fallback retrieval methods"""
+        # Index documents
+        texts = [doc["text"] for doc in documents]
+        self.index.add(np.array(self.embedding_model.encode(texts)))

+        # Two-phase retrieval with fallback
         query_embedding = self.embedding_model.encode([question])
         distances, indices = self.index.search(query_embedding, top_k*2)

+        # Solution 4: Cross-encoder fallback logic
+        if self.cross_encoder:
+            pairs = [[question, documents[idx]["text"]] for idx in indices[0]]
+            scores = self.cross_encoder.predict(pairs)
+            top_indices = [indices[0][i] for i in np.argsort(scores)[-top_k:][::-1]]
+        else:
+            top_indices = indices[0][:top_k]

+        # Prepare context with metadata
         context = "\n\n".join([
+            f"المرجع: {documents[idx]['source']}\n"
+            f"الصفحة: {documents[idx].get('page', 'N/A')}\n"
+            f"النص: {documents[idx]['text']}\n"
+            for idx in top_indices
         ])

+        # Generation with error handling
         prompt = f"""
+        نظام التحليل الديني العربي:
         السياق:
         {context}

+        السؤال: {question}

         التعليمات:
+        - أجب باللغة العربية الفصحى فقط
+        - استخدم المعلومات من السياق فقط
+        - أشر إلى المصادر باستخدام [المرجع: اسم الملف، الصفحة]
+        - إذا لم تجد إجابة واضحة قل "لا تتوفر معلومات كافية"

         الإجابة:
         """.strip()

+        try:
+            inputs = self.tokenizer(prompt, return_tensors="pt")
+            outputs = self.llm.generate(
+                inputs.input_ids,
+                max_new_tokens=512,
+                temperature=temperature,
+                do_sample=True,
+                pad_token_id=self.tokenizer.eos_token_id
+            )
+            answer = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+            answer = answer.split("الإجابة:")[-1].strip()
+        except:
+            answer = "عذراً، حدث خطأ في معالجة السؤال. يرجى المحاولة مرة أخرى."

         # Prepare sources
+        sources = [{
+            "text": documents[idx]["text"],
+            "source": documents[idx]["source"],
+            "page": documents[idx].get("page", "N/A"),
+            "score": float(1 - distances[0][i]) if self.cross_encoder else 0.0
+        } for i, idx in enumerate(top_indices)]

         return answer, sources
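For non-Arabic readers: the new prompt is headed "Arabic religious analysis system", and its instructions tell the model to answer in Modern Standard Arabic only, use only information from the context, cite sources as [المرجع: file name, page], and reply "لا تتوفر معلومات كافية" ("insufficient information is available") when no clear answer exists; the except branch returns "عذراً، حدث خطأ في معالجة السؤال. يرجى المحاولة مرة أخرى." ("Sorry, an error occurred while processing the question. Please try again.").

A few caveats survive the rewrite. The repo id UBC-NLP/AraBERT may not resolve on the Hub (AraBERT checkpoints are published under aubmindlab, while UBC-NLP publishes ARBERT/MARBERT), and wrapping a plain BERT checkpoint in SentenceTransformer falls back to mean pooling rather than a trained sentence encoder; cross-encoder/ms-marco-MiniLM-L-6-v2 is trained on English MS MARCO, so the "Multilingual fallback" comment oversells it for Arabic re-ranking. generate_answer also calls self.index.add(...) on every invocation, so repeated calls pile duplicate vectors into the same IndexFlatL2, and the score field computes 1 - distances[0][i] where i enumerates the re-ranked top_indices rather than the FAISS result order, on top of L2 distances being unbounded, so the value is not a calibrated similarity. A minimal usage sketch that works around the index-accumulation issue by resetting between calls (the sample documents and question are invented placeholders):

    # Hypothetical usage sketch -- sample data is made up for illustration
    rag = ArabicRAGSystem()

    docs = [
        {"text": "نص تجريبي أول", "source": "book1.pdf", "page": 12},  # "a first sample text"
        {"text": "نص تجريبي ثان", "source": "book2.pdf", "page": 3},   # "a second sample text"
    ]

    # Keep top_k*2 <= len(docs): IndexFlatL2.search pads missing hits with -1,
    # and documents[-1] would then silently pick the wrong document.
    rag.index.reset()  # clear vectors left over from any previous call
    answer, sources = rag.generate_answer("ما موضوع هذه النصوص؟", docs, top_k=1)  # "what are these texts about?"

    print(answer)
    for s in sources:
        print(s["source"], s["page"], round(s["score"], 3))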