ramysaidagieb committed
Commit 3de43f6 · verified · 1 Parent(s): a04746d

Update rag_pipeline.py

Files changed (1)
  1. rag_pipeline.py +25 -73
rag_pipeline.py CHANGED
@@ -1,82 +1,34 @@
-from transformers import AutoTokenizer, AutoModel, pipeline
-import torch
-import chromadb
 import time

 class RAGPipeline:
     def __init__(self):
         print("[RAG] Loading the model and tokenizer...")
-        start = time.time()
-
-        self.tokenizer = AutoTokenizer.from_pretrained("aubmindlab/aragpt2-mega", trust_remote_code=True)
-        self.model = AutoModel.from_pretrained("aubmindlab/aragpt2-mega", trust_remote_code=True)
-
-        # Switch the model to evaluation mode only
-        self.model.eval()
-
-        self.embeddings_pipeline = pipeline("feature-extraction", model=self.model, tokenizer=self.tokenizer)
-        self.chroma_client = chromadb.Client()
-        self.chroma_collection = self.chroma_client.get_or_create_collection(name="rag_arabic_docs")
-
-        self.chunk_embeddings = []
         self.chunks = []
-        print(f"[RAG] Loaded successfully in {time.time() - start:.2f} seconds.")
-
-    def build_index(self, chunks, log_callback=None):
         self.chunk_embeddings = []
-        self.chunks = chunks
-        start_time = time.time()
-        total = len(chunks)
-
-        for i, chunk in enumerate(chunks):
-            if log_callback and i % 10 == 0:
-                log_callback(f"[RAG] Processed {i}/{total} chunks.")
-
-            embedding = self.embeddings_pipeline(chunk, truncation=True, padding=True)
-            embedding_vector = torch.tensor(embedding[0]).mean(dim=0).tolist()
-            self.chunk_embeddings.append(embedding_vector)

-        dim = len(self.chunk_embeddings[0])
-        self.chroma_collection.delete()
-        for i, emb in enumerate(self.chunk_embeddings):
-            self.chroma_collection.add(
-                documents=[self.chunks[i]],
-                embeddings=[emb],
-                ids=[str(i)]
-            )
-
-        if log_callback:
-            log_callback(f"[RAG] Built the index with dimension {dim} in {time.time() - start_time:.2f} seconds.")
-        return "Index built."
-
-    def answer_question(self, question, log_callback=None):
-        if not self.chunk_embeddings:
-            return "⚠️ No files have been loaded or indexed yet."
-
-        if log_callback:
-            log_callback(f"[RAG] Retrieving the best chunks for the question: {question}")
-
-        # Compute the embedding of the question
-        question_emb = self.embeddings_pipeline(question, truncation=True, padding=True)
-        question_vector = torch.tensor(question_emb[0]).mean(dim=0).tolist()
-
-        # Retrieve the top 3 chunks
-        results = self.chroma_collection.query(query_embeddings=[question_vector], n_results=3)
-        docs = results["documents"][0]
-        context = "\n".join(docs)
-
-        if log_callback:
-            log_callback("[RAG] Retrieved the following chunks for the answer:\n" + context)
-
-        # Generate the answer
-        full_prompt = f"Question: {question}\n\nReference passages:\n{context}\n\nAnswer:"
-        inputs = self.tokenizer(full_prompt, return_tensors="pt", truncation=True)
-        with torch.no_grad():
-            outputs = self.model.generate(
-                **inputs,
-                max_new_tokens=200,
-                do_sample=True,
-                temperature=0.7
-            )
         answer = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-        return answer

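Two latent bugs in the removed version above are worth noting: AutoModel loads the aragpt2 backbone without a language-modeling head, so the later self.model.generate(...) call cannot produce text (that needs AutoModelForCausalLM), and chromadb rejects collection.delete() when it is called with neither ids nor a where filter. Below is a minimal sketch of how the removed Chroma re-indexing step could have been written; the rebuild_collection helper is hypothetical and not part of this commit.

    # Hypothetical fix for the removed implementation (not part of the commit).
    from transformers import AutoModelForCausalLM
    import chromadb

    # generate() needs a language-modeling head, which AutoModel does not attach:
    model = AutoModelForCausalLM.from_pretrained(
        "aubmindlab/aragpt2-mega", trust_remote_code=True
    )

    def rebuild_collection(collection, chunks, embeddings):
        """Replace the collection contents; delete() needs explicit ids."""
        existing = collection.get()
        if existing["ids"]:
            collection.delete(ids=existing["ids"])
        collection.add(  # add() accepts whole batches; no per-chunk loop needed
            documents=chunks,
            embeddings=embeddings,
            ids=[str(i) for i in range(len(chunks))],
        )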
 
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+from sentence_transformers import SentenceTransformer
+import numpy as np
 import time

 class RAGPipeline:
     def __init__(self):
         print("[RAG] Loading the model and tokenizer...")
+        self.tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large")
+        self.model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large")
+        self.embedder = SentenceTransformer('all-MiniLM-L6-v2')
+        self.index = None
         self.chunks = []
         self.chunk_embeddings = []
+        print("[RAG] Loaded successfully.")

+    def build_index(self, chunks, logs=None):
+        self.chunks = chunks
+        self.chunk_embeddings = self.embedder.encode(chunks, convert_to_numpy=True)
+        if logs is not None:
+            logs.append(f"[RAG] Built the index with shape {self.chunk_embeddings.shape}")
+        self.index = np.array(self.chunk_embeddings)
+
+    def answer(self, question):
+        question_embedding = self.embedder.encode([question], convert_to_numpy=True)
+        # Retrieve the 5 nearest chunks
+        similarities = np.dot(self.index, question_embedding.T).squeeze()
+        top_idx = similarities.argsort()[-5:][::-1]
+        context = "\n".join([self.chunks[i] for i in top_idx])
+        inputs = self.tokenizer.encode(question + " " + context, return_tensors="pt", max_length=512, truncation=True)
+        outputs = self.model.generate(inputs, max_length=200)
         answer = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+        sources = [self.chunks[i] for i in top_idx]
+        return answer, sources
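
For reference, a minimal usage sketch of the updated class follows. Two caveats are my assumptions, not anything stated in the commit: answer() dereferences self.index, which stays None until build_index() has run, and encode() does not unit-normalize all-MiniLM-L6-v2 embeddings unless normalize_embeddings=True is passed, so the np.dot ranking is a raw dot-product score rather than a true cosine similarity. The sample chunks and question are hypothetical.

    rag = RAGPipeline()
    logs = []
    # build_index() must run before answer(); answer() assumes self.index is set.
    rag.build_index(
        ["First chunk of the document ...",
         "Second chunk ...",
         "Third chunk ..."],
        logs=logs,
    )
    answer, sources = rag.answer("What topic does the document cover?")
    print(answer)
    print(sources)

    # Assumption: passing normalize_embeddings=True at encode() time would turn
    # the np.dot ranking inside answer() into a true cosine similarity:
    # rag.embedder.encode(chunks, convert_to_numpy=True, normalize_embeddings=True)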