ramysaidagieb committed on
Commit
4f78a11
·
verified ·
1 Parent(s): 5568b95

Update rag_pipeline.py

Browse files
Files changed (1) hide show
  1. rag_pipeline.py +49 -27
rag_pipeline.py CHANGED
@@ -1,31 +1,53 @@
1
  import time
2
- import torch
3
- from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
4
- import chromadb
5
- from chromadb.config import Settings
6
 
7
  class RAGPipeline:
8
  def __init__(self):
9
- self.tokenizer = AutoTokenizer.from_pretrained("aubmindlab/aragpt2-mega")
10
- self.model = AutoModelForCausalLM.from_pretrained("aubmindlab/aragpt2-mega")
11
- self.generator = pipeline("text-generation", model=self.model, tokenizer=self.tokenizer, device=0 if torch.cuda.is_available() else -1)
12
- self.client = chromadb.Client(Settings(allow_reset=True))
13
- self.collection = None
14
-
15
- def build_index(self, passages):
16
- self.client.reset()
17
- self.collection = self.client.create_collection(name="documents")
18
- documents = [p["text"] for p in passages]
19
- metadatas = [{"source": p["source"]} for p in passages]
20
- ids = [str(i) for i in range(len(documents))]
21
- self.collection.add(documents=documents, metadatas=metadatas, ids=ids)
22
-
23
- def generate_answer(self, question):
24
- if not self.collection:
25
- return "⚠️ لا يوجد فهرس معرف.", []
26
- results = self.collection.query(query_texts=[question], n_results=3)
27
- retrieved = [d for d in results['documents'][0]]
28
- metadatas = results['metadatas'][0]
29
- prompt = question + "\n" + "\n".join(retrieved)
30
- result = self.generator(prompt, max_new_tokens=150, do_sample=True)[0]['generated_text']
31
- return result, [f"{md['source']}: {text[:60]}..." for md, text in zip(metadatas, retrieved)]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import time
2
+ from transformers import AutoTokenizer, AutoModelForCausalLM
3
+ import numpy as np
 
 
4
 
5
  class RAGPipeline:
6
  def __init__(self):
7
+ print("[RAG] Initializing tokenizer and model...")
8
+ self.tokenizer = AutoTokenizer.from_pretrained("aubmindlab/aragpt2-mega", trust_remote_code=True)
9
+ self.generator = AutoModelForCausalLM.from_pretrained("aubmindlab/aragpt2-mega", trust_remote_code=True)
10
+ self.chunk_embeddings = []
11
+ self.index = []
12
+ print("[RAG] Initialization done.")
13
+
14
+ def build_index(self, chunks):
15
+ start_time = time.time()
16
+ print(f"[RAG] Building index for {len(chunks)} chunks...")
17
+ self.chunk_embeddings = []
18
+ self.index = []
19
+
20
+ for i, chunk in enumerate(chunks):
21
+ # هنا يمكن إضافة embedding حقيقي عبر نموذج أو طريقة أخرى
22
+ embedding = self._dummy_embedding(chunk)
23
+ self.chunk_embeddings.append(embedding)
24
+ self.index.append(chunk)
25
+ if i % 10 == 0:
26
+ print(f"[RAG] Processed {i+1}/{len(chunks)} chunks.")
27
+
28
+ self.chunk_embeddings = np.array(self.chunk_embeddings)
29
+ dim = self.chunk_embeddings.shape[1]
30
+ print(f"[RAG] Index built with dimension {dim}. Took {time.time()-start_time:.2f} seconds.")
31
+ return "Index built successfully."
32
+
33
+ def _dummy_embedding(self, text):
34
+ # مؤقتاً فقط: تمثيل نصي عشوائي كـ embedding
35
+ return np.random.rand(768)
36
+
37
+ def generate_answer(self, question, passages):
38
+ start_time = time.time()
39
+ print(f"[RAG] Generating answer for question: {question}")
40
+
41
+ # دمج المقتطفات مع السؤال لصياغة سؤال موجه للنموذج
42
+ prompt = question + "\n\nمراجع:\n" + "\n".join(passages)
43
+
44
+ output = self.generator.generate(
45
+ self.tokenizer(prompt, return_tensors="pt").input_ids,
46
+ max_new_tokens=150,
47
+ do_sample=True,
48
+ )
49
+ response = self.tokenizer.decode(output[0], skip_special_tokens=True)
50
+
51
+ elapsed = time.time() - start_time
52
+ print(f"[RAG] Answer generated in {elapsed:.2f} seconds.")
53
+ return response, passages