ramysaidagieb committed
Commit d05dffb · verified · 1 Parent(s): 20daaea

Update rag_pipeline.py

Files changed (1)
  1. rag_pipeline.py +41 -25
rag_pipeline.py CHANGED
@@ -1,31 +1,47 @@
  from sentence_transformers import SentenceTransformer
- from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
- from langchain.vectorstores import Chroma
- from langchain.embeddings import HuggingFaceEmbeddings
- from langchain.text_splitter import RecursiveCharacterTextSplitter
- from utils import extract_text_from_files

  class RAGPipeline:
      def __init__(self):
-         print("[RAG] جاري تحميل النموذج والمحول...")
-         self.embedding_model = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-base")
-         self.generator = pipeline("text-generation", model="tiiuae/falcon-7b-instruct", trust_remote_code=True, device_map="auto")
-         self.db = None
-         print("[RAG] تم التحميل بنجاح.")

-     def load_and_index(self, files):
-         text = extract_text_from_files(files)
-         splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
-         chunks = splitter.split_text(text)
-         self.db = Chroma.from_texts(chunks, embedding=self.embedding_model)
-         return f"[RAG] تم بناء الفهرس لـ {len(chunks)} مقاطع."

-     def answer_question(self, question):
-         if self.db is None:
-             return "⚠️ لم يتم تحميل مستندات.", []
-         docs = self.db.similarity_search(question, k=3)
-         context = "\n".join([doc.page_content for doc in docs])
-         prompt = f"أجب عن السؤال التالي بناءً على المراجع التالية فقط:\n{context}\n\nالسؤال: {question}\nالإجابة:"
-         result = self.generator(prompt, max_new_tokens=200)[0]["generated_text"]
-         answer = result.split("الإجابة:")[-1].strip()
-         return answer, [doc.page_content for doc in docs]
 
+ # rag_pipeline.py
+ import time
+ import logging
+ import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM
  from sentence_transformers import SentenceTransformer
+ import chromadb
+
+ logger = logging.getLogger("RAG")

  class RAGPipeline:
      def __init__(self):
+         logger.info("[RAG] جاري تحميل النموذج والمحول...")  # "Loading the model and tokenizer..."
+         self.embedding_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
+         self.chunk_embeddings = []
+         self.chunks = []
+         # chromadb.Client() is in-memory by default; Settings(chroma_db_impl="memory")
+         # is not a valid chromadb option and would fail at startup.
+         self.client = chromadb.Client()
+         self.collection = self.client.create_collection(name="rag_collection")
+         self.tokenizer = AutoTokenizer.from_pretrained("aubmindlab/aragpt2-mega", trust_remote_code=True)
+         self.lm = AutoModelForCausalLM.from_pretrained("aubmindlab/aragpt2-mega", trust_remote_code=True)
+         logger.info("[RAG] تم التحميل بنجاح.")  # "Loaded successfully."
+
+     def build_index(self, chunks):
+         start_time = time.time()
+         self.chunks = chunks
+         self.chunk_embeddings = self.embedding_model.encode(chunks, show_progress_bar=True)
+         logger.info(f"[RAG] تم بناء الفهرس بأبعاد {self.chunk_embeddings.shape[1]} في {time.time() - start_time:.2f} ثانية.")  # "Index built with ... dimensions in ... seconds."
+         # Add all chunks in one batched call instead of one add() per chunk.
+         ids = [str(i) for i in range(len(chunks))]
+         self.collection.add(documents=chunks, ids=ids, embeddings=self.chunk_embeddings.tolist())

+     def retrieve(self, query, k=5):
+         logger.info("[RAG] استرجاع المقاطع الأكثر صلة بالسؤال...")  # "Retrieving the passages most relevant to the question..."
+         query_embedding = self.embedding_model.encode([query])[0].tolist()
+         results = self.collection.query(query_embeddings=[query_embedding], n_results=k)
+         return results["documents"][0], results["ids"][0]

+     def generate_answer(self, query):
+         docs, ids = self.retrieve(query)
+         context = "\n\n".join(docs)
+         prompt = f"السياق:\n{context}\n\nالسؤال: {query}\nالإجابة:"  # "Context: ... Question: ... Answer:"
+         inputs = self.tokenizer(prompt, return_tensors="pt")
+         with torch.no_grad():
+             outputs = self.lm.generate(**inputs, max_new_tokens=200)
+         # Decode only the newly generated tokens so the prompt is not echoed back in the answer.
+         answer = self.tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True).strip()
+         return answer, context
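
For context, a minimal usage sketch of the updated class. Note the interface change: the old load_and_index(files) handled extraction and splitting internally, while the new build_index(chunks) expects pre-split text, so callers must chunk upstream. The module import path, sample chunks, and question below are illustrative assumptions, not part of the commit:

    # Hypothetical caller for the updated pipeline (sample data is illustrative).
    from rag_pipeline import RAGPipeline

    rag = RAGPipeline()
    # Chunking now happens upstream; these passages stand in for real document chunks.
    chunks = ["النص الأول من المستند...", "النص الثاني من المستند..."]
    rag.build_index(chunks)
    answer, context = rag.generate_answer("ما هو موضوع المستند؟")  # "What is the document about?"
    print(answer)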