from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from transformers import pipeline
from docx import Document


class ArabicRAGPipeline:
    """Retrieval-augmented generation pipeline that answers questions in Arabic.

    Passages are embedded with a multilingual sentence-transformer model,
    stored in a flat L2 FAISS index, and the nearest passages are inserted
    into an Arabic prompt that is passed to a text-generation pipeline.
    """

    def __init__(self):
        """Load the embedding model and the generator; start with no index."""
        self.embedding_model = SentenceTransformer(
            'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
        )
        # Populated by build_index(); retrieve() refuses to run while None.
        self.retriever_index = None
        self.text_chunks = []
        self.chunk_embeddings = None
        # NOTE(review): confirm this model id resolves on the Hugging Face Hub.
        self.generator = pipeline(
            "text-generation",
            model="NousResearch/Nous-Hermes-2-Mistral",
            tokenizer="NousResearch/Nous-Hermes-2-Mistral",
            max_new_tokens=256,
        )

    def build_index(self, chunks):
        """Embed ``chunks`` and (re)build the FAISS index over them.

        Args:
            chunks: Iterable of indexable records whose first element is the
                passage text. Any further elements are stored untouched and
                handed back by ``retrieve``.

        Raises:
            ValueError: If ``chunks`` is empty.
        """
        chunks = list(chunks)
        if not chunks:
            raise ValueError("build_index() requires at least one chunk")

        texts = [chunk[0] for chunk in chunks]
        # FAISS consumes C-contiguous float32 matrices.
        embeddings = np.ascontiguousarray(
            self.embedding_model.encode(texts, convert_to_tensor=False),
            dtype=np.float32,
        )
        index = faiss.IndexFlatL2(embeddings.shape[1])
        index.add(embeddings)

        # Publish the new state only once everything succeeded, so the stored
        # chunks can never get out of step with the index rows.
        self.text_chunks = chunks
        self.chunk_embeddings = embeddings
        self.retriever_index = index

    def retrieve(self, query, top_k=3):
        """Return up to ``top_k`` stored chunks nearest to ``query``.

        Args:
            query: Query text to embed and search for.
            top_k: Maximum number of chunks to return.

        Returns:
            A list of the stored chunk records, nearest first. It is shorter
            than ``top_k`` when fewer chunks are indexed, and empty when
            ``top_k`` is not positive.

        Raises:
            RuntimeError: If ``build_index`` has not been called yet.
        """
        if self.retriever_index is None:
            raise RuntimeError(
                "No index available: call build_index() before retrieve()"
            )

        # Never ask for more neighbours than there are indexed chunks.
        k = min(top_k, len(self.text_chunks))
        if k <= 0:
            return []

        query_matrix = np.ascontiguousarray(
            self.embedding_model.encode([query]), dtype=np.float32
        )
        _, indices = self.retriever_index.search(query_matrix, k)
        # FAISS pads missing neighbours with -1; as a Python list index that
        # would silently select the last chunk, so drop out-of-range ids.
        return [
            self.text_chunks[int(i)]
            for i in indices[0]
            if 0 <= i < len(self.text_chunks)
        ]

    def generate_answer(self, query, retrieved_passages):
        """Generate an Arabic answer to ``query`` grounded in the passages.

        Args:
            query: The user question.
            retrieved_passages: Chunk records as returned by ``retrieve``;
                the first element of each record is used as context text.

        Returns:
            A ``(answer, retrieved_passages)`` tuple where ``answer`` is the
            stripped generated text.
        """
        # Use passage[0], like build_index, so records of any length work.
        context = "\n\n".join(passage[0] for passage in retrieved_passages)
        prompt = (
            "أجب باللغة العربية الفصحى على السؤال التالي، بالاعتماد فقط على النصوص التالية. "
            "قدم إجابة مدعومة من النص الأصلي، واذكر المرجع المستخدم: "
            f"النصوص: {context} السؤال: {query} الإجابة:"
        )
        response = self.generator(prompt)[0]['generated_text']
        if response.startswith(prompt):
            # The generator echoed the prompt: keep everything after it, even
            # if the model repeats the answer marker inside its own output.
            answer = response[len(prompt):]
        else:
            # Prompt not echoed verbatim; fall back to the last answer marker.
            answer = response.split("الإجابة:")[-1]
        return answer.strip(), retrieved_passages


def _format_citation(citation):
    """Render one citation record as a single line of text."""
    if isinstance(citation, str):
        return citation
    if isinstance(citation, (list, tuple)):
        return " — ".join(str(part) for part in citation)
    return str(citation)


def save_to_doc(answer, citations, path="/tmp/faith_answer.docx"):
    """Write the answer and its sources to a Word document.

    Args:
        answer: Answer text placed under the answer heading.
        citations: Either a single string (written as one paragraph) or an
            iterable of citation records such as the passages returned by
            ``ArabicRAGPipeline.generate_answer``; each record becomes its
            own paragraph.
        path: Destination file; defaults to the original fixed location.

    Returns:
        The path the document was saved to.
    """
    doc = Document()
    doc.add_heading("الإجابة", level=1)
    doc.add_paragraph(answer)
    doc.add_heading("المصادر", level=2)
    if citations is None or isinstance(citations, str):
        doc.add_paragraph(citations)
    else:
        # add_paragraph expects text, so render structured records first.
        for citation in citations:
            doc.add_paragraph(_format_citation(citation))
    doc.save(path)
    return path