# NOTE: scrape artifact from the Hugging Face Spaces page header
# ("Spaces: Sleeping") — kept as a comment so the module stays parseable.
| from sentence_transformers import SentenceTransformer | |
| from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline | |
| from langchain.vectorstores import Chroma | |
| from langchain.embeddings import HuggingFaceEmbeddings | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from utils import extract_text_from_files | |
class RAGPipeline:
    """Retrieval-augmented generation pipeline over user-supplied documents.

    Embeds document chunks with a multilingual E5 model into a Chroma
    vector store, then answers Arabic-language questions with
    Falcon-7B-Instruct, grounded on the retrieved chunks.
    """

    def __init__(self):
        # NOTE(review): falcon-7b-instruct needs substantial memory;
        # device_map="auto" lets accelerate place layers across devices.
        print("[RAG] جاري تحميل النموذج والمحول...")
        self.embedding_model = HuggingFaceEmbeddings(
            model_name="intfloat/multilingual-e5-base"
        )
        self.generator = pipeline(
            "text-generation",
            model="tiiuae/falcon-7b-instruct",
            trust_remote_code=True,
            device_map="auto",
        )
        # Chroma index; None until load_and_index() builds it.
        self.db = None
        print("[RAG] تم التحميل بنجاح.")

    def load_and_index(self, files):
        """Extract text from *files*, split it into chunks, build the index.

        Parameters: files — whatever extract_text_from_files accepts
        (uploaded file objects; schema defined in utils — not visible here).
        Returns a status string describing the number of indexed chunks.
        """
        text = extract_text_from_files(files)
        splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
        chunks = splitter.split_text(text)
        # Fix: Chroma.from_texts raises on an empty list — guard so callers
        # get a readable message instead of a traceback when no text was
        # extracted from the files.
        if not chunks:
            return "⚠️ لم يتم استخراج أي نص من الملفات."
        self.db = Chroma.from_texts(chunks, embedding=self.embedding_model)
        return f"[RAG] تم بناء الفهرس لـ {len(chunks)} مقاطع."

    def answer_question(self, question):
        """Answer *question* using the top-3 most similar indexed chunks.

        Returns a tuple ``(answer, sources)`` where *sources* is the list
        of chunk texts used as context. If no index has been built yet,
        returns a warning string and an empty source list.
        """
        if self.db is None:
            return "⚠️ لم يتم تحميل مستندات.", []
        docs = self.db.similarity_search(question, k=3)
        context = "\n".join(doc.page_content for doc in docs)
        prompt = (
            "أجب عن السؤال التالي بناءً على المراجع التالية فقط:\n"
            f"{context}\n\nالسؤال: {question}\nالإجابة:"
        )
        # The HF text-generation pipeline echoes the prompt in its output;
        # keep only the text after the final answer marker.
        result = self.generator(prompt, max_new_tokens=200)[0]["generated_text"]
        answer = result.split("الإجابة:")[-1].strip()
        return answer, [doc.page_content for doc in docs]