Spaces:
Runtime error
Runtime error
File size: 4,150 Bytes
c90b40e b82ee60 c90b40e b82ee60 c90b40e b82ee60 c90b40e b82ee60 c90b40e b82ee60 c90b40e b82ee60 c90b40e b82ee60 c90b40e b82ee60 c90b40e b82ee60 c90b40e b82ee60 c90b40e b82ee60 c90b40e b82ee60 c90b40e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 |
from sentence_transformers import CrossEncoder, SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM
import faiss
import numpy as np
from typing import List, Dict
class ArabicRAGSystem:
    """Retrieval-augmented generation (RAG) system for Arabic question answering.

    Pipeline: embed documents with an Arabic sentence-transformer, retrieve
    nearest neighbours from a FAISS L2 index, optionally rerank candidates
    with a cross-encoder, then generate an answer with an Arabic causal LM.
    """

    # Embedding dimensionality of the sentence-transformer below (BERT-base = 768).
    EMBEDDING_DIM = 768

    def __init__(self):
        """Initialize embedding model, optional reranker, and LLM with fallbacks.

        The cross-encoder and the large LLM are best-effort: load failures fall
        back to vector-only retrieval and a smaller Arabic model respectively.
        """
        # Primary Arabic embedding model.
        # NOTE(review): "UBC-NLP/AraBERT" may not resolve on the HF Hub —
        # AraBERT is published under the aubmindlab org; confirm the model id.
        self.embedding_model = SentenceTransformer("UBC-NLP/AraBERT")

        # Optional multilingual cross-encoder reranker. The system works
        # without it, so only loading errors are swallowed here.
        # (Was a bare `except:`, which also caught KeyboardInterrupt/SystemExit.)
        try:
            self.cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
        except Exception:
            self.cross_encoder = None  # degrade to pure vector search

        # Main Arabic LLM, with a smaller Arabic GPT-2 as fallback.
        try:
            self.tokenizer = AutoTokenizer.from_pretrained("inception-mbzuai/jais-13b-chat")
            self.llm = AutoModelForCausalLM.from_pretrained("inception-mbzuai/jais-13b-chat")
        except Exception:
            self.tokenizer = AutoTokenizer.from_pretrained("aubmindlab/aragpt2-base")
            self.llm = AutoModelForCausalLM.from_pretrained("aubmindlab/aragpt2-base")

        # Flat (exact) L2 index matching the embedding dimensionality.
        self.index = faiss.IndexFlatL2(self.EMBEDDING_DIM)

    def generate_answer(self, question: str, documents: List[Dict],
                        top_k: int = 5, temperature: float = 0.7) -> tuple:
        """Answer *question* from *documents* and return ``(answer, sources)``.

        Args:
            question: User question in Arabic.
            documents: Dicts with at least ``"text"`` and ``"source"`` keys;
                ``"page"`` is optional.
            top_k: Number of passages to place in the generation context.
            temperature: Sampling temperature for the LLM.

        Returns:
            Tuple of the generated answer string and a list of source dicts
            (``text``, ``source``, ``page``, ``score``).
        """
        # BUG FIX: the original added vectors on every call without clearing,
        # so repeated calls accumulated duplicates and returned stale indices.
        self.index.reset()
        texts = [doc["text"] for doc in documents]
        # FAISS requires contiguous float32 input.
        embeddings = np.asarray(self.embedding_model.encode(texts), dtype="float32")
        self.index.add(embeddings)

        # Retrieve a wider candidate pool for reranking, but never request
        # more neighbours than exist — FAISS pads missing slots with -1,
        # which the original then used as a (negative) list index.
        n_candidates = min(top_k * 2, len(documents))
        query_embedding = np.asarray(
            self.embedding_model.encode([question]), dtype="float32"
        )
        distances, indices = self.index.search(query_embedding, n_candidates)
        candidates = [int(idx) for idx in indices[0] if idx >= 0]

        if self.cross_encoder:
            # Rerank candidates; keep the top_k highest cross-encoder scores.
            pairs = [[question, documents[idx]["text"]] for idx in candidates]
            scores = self.cross_encoder.predict(pairs)
            order = np.argsort(scores)[-top_k:][::-1]
            top_indices = [candidates[i] for i in order]
            # BUG FIX: the original scored reranked results with
            # `1 - distances[0][i]`, where i no longer matched the distance
            # array after reordering. Report the actual reranker scores.
            top_scores = [float(scores[i]) for i in order]
        else:
            top_indices = candidates[:top_k]
            # BUG FIX: the original emitted 0.0 here — the one case where the
            # distance-derived score would actually have been consistent.
            top_scores = [float(1 - d) for d in distances[0][:len(top_indices)]]

        # Build the Arabic context block with per-passage provenance.
        context = "\n\n".join([
            f"المرجع: {documents[idx]['source']}\n"
            f"الصفحة: {documents[idx].get('page', 'N/A')}\n"
            f"النص: {documents[idx]['text']}\n"
            for idx in top_indices
        ])

        prompt = f"""
نظام التحليل الديني العربي:
السياق:
{context}
السؤال: {question}
التعليمات:
- أجب باللغة العربية الفصحى فقط
- استخدم المعلومات من السياق فقط
- أشر إلى المصادر باستخدام [المرجع: اسم الملف، الصفحة]
- إذا لم تجد إجابة واضحة قل "لا تتوفر معلومات كافية"
الإجابة:
""".strip()

        # Generation is best-effort: any model/tokenizer failure yields a
        # fixed Arabic apology message instead of crashing the caller.
        try:
            inputs = self.tokenizer(prompt, return_tensors="pt")
            outputs = self.llm.generate(
                inputs.input_ids,
                max_new_tokens=512,
                temperature=temperature,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
            )
            answer = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            # Keep only the text after the final "الإجابة:" marker.
            answer = answer.split("الإجابة:")[-1].strip()
        except Exception:
            answer = "عذراً، حدث خطأ في معالجة السؤال. يرجى المحاولة مرة أخرى."

        # Attach provenance and the retrieval/rerank score for each passage.
        sources = [{
            "text": documents[idx]["text"],
            "source": documents[idx]["source"],
            "page": documents[idx].get("page", "N/A"),
            "score": top_scores[i],
        } for i, idx in enumerate(top_indices)]

        return answer, sources