from sentence_transformers import SentenceTransformer  # NOTE(review): currently unused; kept in case other modules rely on this import
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

from utils import extract_text_from_files


class RAGPipeline:
    """Retrieval-Augmented Generation pipeline.

    Embeds user documents into a Chroma vector store with a multilingual
    E5 embedding model, then answers questions by retrieving the top
    matching chunks and feeding them as context to a Falcon-7B-Instruct
    text-generation model. Prompts and user-facing messages are in Arabic.
    """

    def __init__(self) -> None:
        """Load the embedding model and the text-generation pipeline.

        Heavy: downloads/loads Falcon-7B-Instruct onto available devices
        (``device_map="auto"``). The vector store starts empty (``None``)
        until :meth:`load_and_index` is called.
        """
        # "[RAG] Loading the model and tokenizer..."
        print("[RAG] جاري تحميل النموذج والمحول...")
        self.embedding_model = HuggingFaceEmbeddings(
            model_name="intfloat/multilingual-e5-base"
        )
        self.generator = pipeline(
            "text-generation",
            model="tiiuae/falcon-7b-instruct",
            trust_remote_code=True,
            device_map="auto",
        )
        self.db = None  # Chroma index; built lazily by load_and_index()
        # "[RAG] Loaded successfully."
        print("[RAG] تم التحميل بنجاح.")

    def load_and_index(self, files) -> str:
        """Extract text from *files*, chunk it, and build the Chroma index.

        Args:
            files: File objects/paths accepted by ``extract_text_from_files``.

        Returns:
            A status message (Arabic) reporting the number of indexed chunks.
        """
        text = extract_text_from_files(files)
        splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
        chunks = splitter.split_text(text)
        # Guard: Chroma.from_texts raises on an empty list, which happens
        # when no extractable text was found in the uploaded files.
        if not chunks:
            # "[RAG] No text could be extracted from the files."
            return "[RAG] لم يتم استخراج أي نص من الملفات."
        self.db = Chroma.from_texts(chunks, embedding=self.embedding_model)
        # "[RAG] Index built for N chunks."
        return f"[RAG] تم بناء الفهرس لـ {len(chunks)} مقاطع."

    def answer_question(self, question: str):
        """Answer *question* using the top-3 retrieved document chunks.

        Args:
            question: The user's question (Arabic prompts are used).

        Returns:
            A ``(answer, sources)`` tuple where *sources* is the list of
            retrieved chunk texts. If no index has been built yet, returns
            a warning message and an empty list.
        """
        if self.db is None:
            # "No documents have been loaded."
            return "⚠️ لم يتم تحميل مستندات.", []
        docs = self.db.similarity_search(question, k=3)
        # Compute the source texts once; reused for both the prompt context
        # and the returned sources list (the original built it twice).
        sources = [doc.page_content for doc in docs]
        context = "\n".join(sources)
        # "Answer the following question based only on these references: ..."
        prompt = (
            f"أجب عن السؤال التالي بناءً على المراجع التالية فقط:\n{context}\n\n"
            f"السؤال: {question}\nالإجابة:"
        )
        result = self.generator(prompt, max_new_tokens=200)[0]["generated_text"]
        # The pipeline echoes the prompt; keep only the text after the final
        # "الإجابة:" ("Answer:") marker.
        answer = result.split("الإجابة:")[-1].strip()
        return answer, sources