Spaces:
Sleeping
Sleeping
Update rag_pipeline.py
Browse files- rag_pipeline.py +70 -35
rag_pipeline.py
CHANGED
@@ -1,47 +1,82 @@
|
|
1 |
-
|
2 |
-
import time
|
3 |
-
import logging
|
4 |
-
import numpy as np
|
5 |
import torch
|
6 |
-
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel
|
7 |
-
from sentence_transformers import SentenceTransformer
|
8 |
import chromadb
|
9 |
-
|
10 |
-
|
11 |
-
logger = logging.getLogger("RAG")
|
12 |
|
13 |
class RAGPipeline:
|
14 |
def __init__(self):
|
15 |
-
|
16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
self.chunk_embeddings = []
|
18 |
self.chunks = []
|
19 |
-
|
20 |
-
self.collection = self.client.create_collection(name="rag_collection")
|
21 |
-
self.tokenizer = AutoTokenizer.from_pretrained("aubmindlab/aragpt2-mega", trust_remote_code=True)
|
22 |
-
self.lm = AutoModelForCausalLM.from_pretrained("aubmindlab/aragpt2-mega", trust_remote_code=True)
|
23 |
-
logger.info("[RAG] تم التحميل بنجاح.")
|
24 |
|
25 |
-
def build_index(self, chunks):
|
26 |
-
|
27 |
self.chunks = chunks
|
28 |
-
|
29 |
-
|
|
|
30 |
for i, chunk in enumerate(chunks):
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
with torch.no_grad():
|
45 |
-
outputs = self.
|
|
|
|
|
|
|
|
|
|
|
46 |
answer = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
|
47 |
-
return answer
|
|
|
1 |
+
from transformers import AutoTokenizer, AutoModel, pipeline
|
|
|
|
|
|
|
2 |
import torch
|
|
|
|
|
3 |
import chromadb
|
4 |
+
import time
|
|
|
|
|
5 |
|
6 |
class RAGPipeline:
    """Retrieval-augmented generation over Arabic text chunks.

    Text chunks are embedded by mean-pooling the per-token features that a
    ``feature-extraction`` pipeline produces for the ``MODEL_NAME``
    checkpoint, and are stored in a Chroma collection. To answer a question,
    the question is embedded the same way, up to ``TOP_K`` chunks are fetched
    from the collection, and the same checkpoint generates a completion of a
    prompt built from the question and those chunks.
    """

    MODEL_NAME = "aubmindlab/aragpt2-mega"
    COLLECTION_NAME = "rag_arabic_docs"
    # Number of chunks retrieved per question.
    TOP_K = 3
    # Number of vectors sent to Chroma per ``add`` call.
    _ADD_BATCH_SIZE = 256

    def __init__(self):
        """Load the tokenizer and model and open the Chroma collection.

        Prints a start message and, once everything is loaded, the elapsed
        loading time.
        """
        # Imported locally because the module-level import list only brings
        # in ``AutoModel``.
        from transformers import AutoModelForCausalLM

        print("[RAG] جاري تحميل النموذج والمحول...")
        start = time.time()

        self.tokenizer = AutoTokenizer.from_pretrained(
            self.MODEL_NAME, trust_remote_code=True
        )
        # ``generate`` needs a language-modelling head, which a model loaded
        # through the bare ``AutoModel`` class does not have.
        self.lm = AutoModelForCausalLM.from_pretrained(
            self.MODEL_NAME, trust_remote_code=True
        )

        # Inference only: switch the model to evaluation mode.
        self.lm.eval()

        # Headless transformer used for embeddings. ``base_model`` reuses the
        # weights already held by ``self.lm`` instead of loading a second copy.
        # NOTE(review): relies on the checkpoint's remote code defining
        # ``base_model_prefix``; otherwise ``base_model`` is the LM itself.
        self.model = self.lm.base_model

        self.embeddings_pipeline = pipeline(
            "feature-extraction", model=self.model, tokenizer=self.tokenizer
        )
        self.chroma_client = chromadb.Client()
        self.chroma_collection = self.chroma_client.get_or_create_collection(
            name=self.COLLECTION_NAME
        )

        self.chunk_embeddings = []
        self.chunks = []
        print(f"[RAG] تم التحميل بنجاح في {time.time() - start:.2f} ثانية.")

    def _embed(self, text):
        """Return one embedding vector (a list of floats) for *text*."""
        # A single text is embedded per call, so no padding is requested.
        features = self.embeddings_pipeline(text, truncation=True)
        # features[0] holds one feature row per token; average over tokens.
        return torch.tensor(features[0]).mean(dim=0).tolist()

    def build_index(self, chunks, log_callback=None):
        """Embed *chunks* and rebuild the Chroma collection from them.

        Args:
            chunks: Iterable of text chunks to index.
            log_callback: Optional callable taking one message string. It is
                called every 10th chunk with a progress message and once at
                the end with the embedding dimension and elapsed time.

        Returns:
            A status string: a confirmation after a successful build, or a
            warning when *chunks* is empty. In the empty case the in-memory
            index is cleared and Chroma is not touched.
        """
        chunks = list(chunks)
        if not chunks:
            # Nothing to embed; clear the state so answer_question refuses
            # to run instead of failing on a missing first embedding.
            self.chunk_embeddings = []
            self.chunks = []
            return "⚠️ لا توجد مقاطع لفهرستها."

        start_time = time.time()
        total = len(chunks)

        embeddings = []
        for i, chunk in enumerate(chunks):
            if log_callback and i % 10 == 0:
                log_callback(f"[RAG] تم معالجة {i}/{total} مقاطع.")
            embeddings.append(self._embed(chunk))

        dim = len(embeddings[0])

        # Drop and recreate the collection so no entry from a previous build
        # survives; calling ``delete()`` without ids or a filter is not a
        # reliable way to empty a collection.
        self.chroma_client.delete_collection(name=self.COLLECTION_NAME)
        self.chroma_collection = self.chroma_client.get_or_create_collection(
            name=self.COLLECTION_NAME
        )
        for lo in range(0, total, self._ADD_BATCH_SIZE):
            batch_docs = chunks[lo:lo + self._ADD_BATCH_SIZE]
            self.chroma_collection.add(
                documents=batch_docs,
                embeddings=embeddings[lo:lo + self._ADD_BATCH_SIZE],
                ids=[str(i) for i in range(lo, lo + len(batch_docs))],
            )

        # Commit only after a successful build so a failure part-way through
        # does not leave chunks and embeddings out of step.
        self.chunks = chunks
        self.chunk_embeddings = embeddings

        if log_callback:
            log_callback(f"[RAG] تم بناء الفهرس بأبعاد {dim} في {time.time() - start_time:.2f} ثانية.")
        return "تم بناء الفهرس."

    def answer_question(self, question, log_callback=None):
        """Generate an answer to *question* from the indexed chunks.

        Args:
            question: The question text.
            log_callback: Optional callable taking one message string. It
                receives the question being processed and the retrieved
                context.

        Returns:
            The generated text with the prompt removed, or a warning string
            when no chunks have been indexed yet.
        """
        if not self.chunk_embeddings:
            return "⚠️ لم يتم تحميل أو فهرسة أي ملفات بعد."

        if log_callback:
            log_callback(f"[RAG] جاري استخراج أفضل مقاطع للسؤال: {question}")

        # Embed the question the same way the chunks were embedded.
        question_vector = self._embed(question)

        # Retrieve the best chunks, never asking for more than are indexed.
        results = self.chroma_collection.query(
            query_embeddings=[question_vector],
            n_results=min(self.TOP_K, len(self.chunk_embeddings)),
        )
        context = "\n".join(results["documents"][0])

        if log_callback:
            log_callback("[RAG] تم استخراج المقاطع التالية للإجابة:\n" + context)

        # Generate the answer.
        full_prompt = f"السؤال: {question}\n\nالمقاطع المرجعية:\n{context}\n\nالإجابة:"
        # NOTE(review): truncation uses the tokenizer's default limit; a very
        # long context may leave no room for the 200 new tokens - confirm.
        inputs = self.tokenizer(full_prompt, return_tensors="pt", truncation=True)
        with torch.no_grad():
            outputs = self.lm.generate(
                **inputs,
                max_new_tokens=200,
                do_sample=True,
                temperature=0.7,
            )

        # The output sequence begins with the prompt tokens; decode only what
        # was generated after them so the prompt is not echoed back.
        prompt_length = inputs["input_ids"].shape[1]
        answer = self.tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        )
        return answer.strip()
|