ramysaidagieb committed
Commit 286b392 · verified · 1 Parent(s): 848b322

Update rag_pipeline.py

Files changed (1)
  1. rag_pipeline.py +70 -35
rag_pipeline.py CHANGED
@@ -1,47 +1,82 @@
- # rag_pipeline.py
- import time
- import logging
- import numpy as np
import torch
- from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel
- from sentence_transformers import SentenceTransformer
import chromadb
- from chromadb.config import Settings
-
- logger = logging.getLogger("RAG")

class RAGPipeline:
    def __init__(self):
-         logger.info("[RAG] Loading model and tokenizer...")
-         self.embedding_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
        self.chunk_embeddings = []
        self.chunks = []
-         self.client = chromadb.Client(Settings(chroma_db_impl="memory", persist_directory=None))
-         self.collection = self.client.create_collection(name="rag_collection")
-         self.tokenizer = AutoTokenizer.from_pretrained("aubmindlab/aragpt2-mega", trust_remote_code=True)
-         self.lm = AutoModelForCausalLM.from_pretrained("aubmindlab/aragpt2-mega", trust_remote_code=True)
-         logger.info("[RAG] Loaded successfully.")

-     def build_index(self, chunks):
-         start_time = time.time()
        self.chunks = chunks
-         self.chunk_embeddings = self.embedding_model.encode(chunks, show_progress_bar=True)
-         logger.info(f"[RAG] Built index with dimension {self.chunk_embeddings.shape[1]} in {time.time() - start_time:.2f} seconds.")
        for i, chunk in enumerate(chunks):
-             self.collection.add(documents=[chunk], ids=[str(i)], embeddings=[self.chunk_embeddings[i].tolist()])
-
-     def retrieve(self, query, k=5):
-         logger.info("[RAG] Retrieving the passages most relevant to the question...")
-         query_embedding = self.embedding_model.encode([query])[0].tolist()
-         results = self.collection.query(query_embeddings=[query_embedding], n_results=k)
-         return results["documents"][0], results["ids"][0]
-
-     def generate_answer(self, query):
-         docs, ids = self.retrieve(query)
-         context = "\n\n".join(docs)
-         prompt = f"السياق:\n{context}\n\nالسؤال: {query}\nالإجابة:"  # Arabic "Context / Question / Answer" prompt
-         inputs = self.tokenizer(prompt, return_tensors="pt")
        with torch.no_grad():
-             outputs = self.lm.generate(**inputs, max_new_tokens=200)
        answer = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-         return answer, context

+ from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, pipeline
import torch
import chromadb
+ import time

class RAGPipeline:
    def __init__(self):
+         print("[RAG] Loading model and tokenizer...")
+         start = time.time()
+
+         self.tokenizer = AutoTokenizer.from_pretrained("aubmindlab/aragpt2-mega", trust_remote_code=True)
+         # Base model (no LM head) for the feature-extraction pipeline.
+         self.model = AutoModel.from_pretrained("aubmindlab/aragpt2-mega", trust_remote_code=True)
+         # AutoModel has no .generate(); load the LM-head variant separately for answer generation.
+         self.lm = AutoModelForCausalLM.from_pretrained("aubmindlab/aragpt2-mega", trust_remote_code=True)
+
+         # Inference only: switch both models to eval mode
+         self.model.eval()
+         self.lm.eval()
+
+         self.embeddings_pipeline = pipeline("feature-extraction", model=self.model, tokenizer=self.tokenizer)
+         self.chroma_client = chromadb.Client()
+         self.chroma_collection = self.chroma_client.get_or_create_collection(name="rag_arabic_docs")
+
        self.chunk_embeddings = []
        self.chunks = []
+         print(f"[RAG] Loaded successfully in {time.time() - start:.2f} seconds.")

+     def build_index(self, chunks, log_callback=None):
+         self.chunk_embeddings = []
        self.chunks = chunks
+         start_time = time.time()
+         total = len(chunks)
+         if total == 0:
+             return "No chunks to index."
+
        for i, chunk in enumerate(chunks):
+             if log_callback and i % 10 == 0:
+                 log_callback(f"[RAG] Processed {i}/{total} chunks.")
+
+             # Mean-pool the token-level hidden states into one vector per chunk
+             # (padding is unnecessary for single strings, and GPT-2 has no pad token).
+             embedding = self.embeddings_pipeline(chunk, truncation=True)
+             embedding_vector = torch.tensor(embedding[0]).mean(dim=0).tolist()
+             self.chunk_embeddings.append(embedding_vector)
+
+         dim = len(self.chunk_embeddings[0])
+         # Collection.delete() requires explicit ids; clear any previous index first.
+         existing = self.chroma_collection.get()
+         if existing["ids"]:
+             self.chroma_collection.delete(ids=existing["ids"])
+         for i, emb in enumerate(self.chunk_embeddings):
+             self.chroma_collection.add(
+                 documents=[self.chunks[i]],
+                 embeddings=[emb],
+                 ids=[str(i)]
+             )
+
+         if log_callback:
+             log_callback(f"[RAG] Built index with dimension {dim} in {time.time() - start_time:.2f} seconds.")
+         return "Index built."
+
+     def answer_question(self, question, log_callback=None):
+         if not self.chunk_embeddings:
+             return "⚠️ No files have been loaded or indexed yet."
+
+         if log_callback:
+             log_callback(f"[RAG] Retrieving the best passages for the question: {question}")
+
+         # Embed the question the same way the chunks were embedded
+         question_emb = self.embeddings_pipeline(question, truncation=True)
+         question_vector = torch.tensor(question_emb[0]).mean(dim=0).tolist()
+
+         # Retrieve the top 3 passages
+         results = self.chroma_collection.query(query_embeddings=[question_vector], n_results=3)
+         docs = results["documents"][0]
+         context = "\n".join(docs)
+
+         if log_callback:
+             log_callback("[RAG] Retrieved the following passages for the answer:\n" + context)
+
+         # Generate the answer; the prompt stays in Arabic since the model is an Arabic LM
+         # ("Question: ... / Reference passages: ... / Answer:").
+         full_prompt = f"السؤال: {question}\n\nالمقاطع المرجعية:\n{context}\n\nالإجابة:"
+         inputs = self.tokenizer(full_prompt, return_tensors="pt", truncation=True)
        with torch.no_grad():
+             outputs = self.lm.generate(
+                 **inputs,
+                 max_new_tokens=200,
+                 do_sample=True,
+                 temperature=0.7
+             )
        answer = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+         return answer
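
For reference, a minimal sketch of how the updated class might be driven from an app or a REPL. The chunk texts, the question, and the use of print as the log_callback are illustrative assumptions, not part of this commit:

# usage_example.py — hypothetical driver for RAGPipeline (not part of this commit)
from rag_pipeline import RAGPipeline

rag = RAGPipeline()

# Illustrative chunks; in the real app these come from uploaded documents.
chunks = [
    "الذكاء الاصطناعي هو فرع من علوم الحاسوب.",   # "AI is a branch of computer science."
    "التعلم العميق يعتمد على الشبكات العصبية.",   # "Deep learning relies on neural networks."
]

rag.build_index(chunks, log_callback=print)  # progress messages go to stdout
answer = rag.answer_question("ما هو الذكاء الاصطناعي؟", log_callback=print)  # "What is artificial intelligence?"
print(answer)

Since build_index embeds each chunk with a full forward pass of AraGPT2-mega, indexing large documents can be slow; batching the pipeline calls would be a natural next optimization.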