File size: 1,756 Bytes
4254fda
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from utils import extract_text_from_files

class RAGPipeline:
    """Retrieval-augmented generation pipeline.

    Indexes document text into an in-memory Chroma vector store and answers
    questions with a causal LM, grounding the answer in the top retrieved
    chunks. (User-facing messages are intentionally in Arabic.)
    """

    def __init__(
        self,
        embedding_model_name="intfloat/multilingual-e5-base",
        generator_model_name="tiiuae/falcon-7b-instruct",
    ):
        """Load the embedding model and the text-generation pipeline.

        Args:
            embedding_model_name: HuggingFace model used to embed chunks and queries.
            generator_model_name: HuggingFace causal LM used to generate answers.
        """
        print("[RAG] جاري تحميل النموذج والمحول...")
        self.embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)
        # NOTE(review): trust_remote_code=True executes code from the model repo;
        # only point this at trusted repositories.
        self.generator = pipeline(
            "text-generation",
            model=generator_model_name,
            trust_remote_code=True,
            device_map="auto",
        )
        self.db = None  # Chroma index; built lazily by load_and_index()
        print("[RAG] تم التحميل بنجاح.")

    def load_and_index(self, files, chunk_size=500, chunk_overlap=50):
        """Extract text from *files*, chunk it, and (re)build the Chroma index.

        Args:
            files: passed through to extract_text_from_files().
            chunk_size: max characters per chunk.
            chunk_overlap: characters shared between consecutive chunks.

        Returns:
            Status message (Arabic) with the number of indexed chunks.
        """
        text = extract_text_from_files(files)
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
        chunks = splitter.split_text(text)
        # Replaces any previously built index.
        self.db = Chroma.from_texts(chunks, embedding=self.embedding_model)
        return f"[RAG] تم بناء الفهرس لـ {len(chunks)} مقاطع."

    def answer_question(self, question, k=3, max_new_tokens=200):
        """Answer *question* using the top-*k* retrieved chunks as context.

        Args:
            question: user question (Arabic prompt template is used).
            k: number of chunks retrieved from the index.
            max_new_tokens: generation budget for the LM.

        Returns:
            (answer, sources) where sources is the list of chunk texts used;
            returns a warning message and an empty list when no index exists.
        """
        if self.db is None:
            return "⚠️ لم يتم تحميل مستندات.", []
        docs = self.db.similarity_search(question, k=k)
        # Build the source list once; reused for both the prompt and the return value.
        sources = [doc.page_content for doc in docs]
        context = "\n".join(sources)
        prompt = f"أجب عن السؤال التالي بناءً على المراجع التالية فقط:\n{context}\n\nالسؤال: {question}\nالإجابة:"
        result = self.generator(prompt, max_new_tokens=max_new_tokens)[0]["generated_text"]
        # The HF text-generation pipeline echoes the prompt, so keep only the
        # text after the final answer marker.
        answer = result.split("الإجابة:")[-1].strip()
        return answer, sources