Spaces:
Sleeping
Sleeping
from sentence_transformers import SentenceTransformer | |
import faiss | |
import numpy as np | |
from transformers import pipeline | |
from docx import Document | |
class ArabicRAGPipeline:
    """Retrieval-augmented generation pipeline answering questions in Arabic.

    Text chunks are embedded with a multilingual sentence-transformer model and
    stored in a flat L2 FAISS index.  At query time the nearest chunks are
    retrieved and inserted into an Arabic prompt for a text-generation model.
    """

    def __init__(self):
        """Load the embedding model and the text-generation pipeline."""
        self.embedding_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
        # Populated by build_index(); None means no index has been built yet.
        self.retriever_index = None
        self.text_chunks = []
        self.chunk_embeddings = None
        self.generator = pipeline(
            "text-generation",
            model="NousResearch/Nous-Hermes-2-Mistral",
            tokenizer="NousResearch/Nous-Hermes-2-Mistral",
            max_new_tokens=256
        )

    def build_index(self, chunks):
        """Embed *chunks* and (re)build the FAISS index over them.

        Args:
            chunks: Iterable of indexable items whose first element is the
                chunk text (generate_answer() unpacks them as 2-tuples).

        Raises:
            ValueError: If *chunks* is empty, since the embedding dimension
                cannot be determined without at least one vector.
        """
        chunks = list(chunks)
        if not chunks:
            raise ValueError("build_index() requires at least one chunk")

        texts = [chunk[0] for chunk in chunks]
        # FAISS only accepts C-contiguous float32 matrices.
        embeddings = np.ascontiguousarray(
            self.embedding_model.encode(texts, convert_to_tensor=False),
            dtype="float32",
        )
        index = faiss.IndexFlatL2(embeddings.shape[1])
        index.add(embeddings)

        # Commit state only once everything succeeded, so a failure above
        # cannot leave chunks and index out of sync.
        self.text_chunks = chunks
        self.chunk_embeddings = embeddings
        self.retriever_index = index

    def retrieve(self, query, top_k=3):
        """Return up to *top_k* indexed chunks closest to *query*.

        Args:
            query: The question text to embed and search for.
            top_k: Maximum number of chunks to return.

        Returns:
            A list of the stored chunk items, nearest first.  It is shorter
            than *top_k* when fewer chunks are indexed, and empty when
            *top_k* is less than 1.

        Raises:
            RuntimeError: If build_index() has not been called yet.
        """
        if self.retriever_index is None:
            raise RuntimeError("retrieve() called before build_index()")

        # Never ask for more neighbours than there are chunks.
        top_k = min(top_k, len(self.text_chunks))
        if top_k < 1:
            return []

        query_matrix = np.ascontiguousarray(
            self.embedding_model.encode([query]), dtype="float32"
        )
        _, indices = self.retriever_index.search(query_matrix, top_k)
        # FAISS pads missing results with -1, which would otherwise index the
        # last chunk; drop anything outside the valid range.
        chunk_count = len(self.text_chunks)
        return [self.text_chunks[i] for i in indices[0] if 0 <= i < chunk_count]

    def generate_answer(self, query, retrieved_passages):
        """Generate an Arabic answer to *query* grounded in the passages.

        Args:
            query: The user's question.
            retrieved_passages: Iterable of 2-item passages; the first item of
                each is the text inserted into the prompt.

        Returns:
            A tuple ``(answer, retrieved_passages)`` where *answer* is the
            generated text following the last answer marker in the model
            output, stripped of surrounding whitespace.
        """
        context = "\n\n".join(text for text, _ in retrieved_passages)
        prompt = (
            "أجب باللغة العربية الفصحى على السؤال التالي، بالاعتماد فقط على النصوص التالية. قدم إجابة مدعومة من النص الأصلي، واذكر المرجع المستخدم:\n"
            "النصوص:\n"
            f"{context}\n"
            f"السؤال: {query}\n"
            "الإجابة:"
        )
        response = self.generator(prompt)[0]['generated_text']
        # The generated text is split on the answer marker so only what
        # follows its last occurrence is returned.
        return response.split("الإجابة:")[-1].strip(), retrieved_passages
def save_to_doc(answer, citations, path="/tmp/faith_answer.docx"):
    """Write an answer and its citations to a Word document.

    Args:
        answer: Answer text placed under the level-1 heading.
        citations: Either a single string (written as one paragraph) or an
            iterable of citation items, each written as its own paragraph
            via ``str(item)``.
        path: Destination file path.  Defaults to the original fixed
            location; pass a unique path when calls may run concurrently so
            they do not overwrite each other's output.

    Returns:
        The path the document was saved to.
    """
    doc = Document()
    doc.add_heading("الإجابة", level=1)
    doc.add_paragraph(answer)
    doc.add_heading("المصادر", level=2)
    if citations is None or isinstance(citations, str):
        doc.add_paragraph(citations)
    else:
        # add_paragraph expects text; a list of passages is written one
        # paragraph per item instead of being passed through as-is.
        for item in citations:
            doc.add_paragraph(str(item))
    doc.save(path)
    return path