ramysaidagieb committed on
Commit
6a78ac0
·
verified ·
1 Parent(s): 1c9be4e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -8
app.py CHANGED
@@ -16,19 +16,20 @@ col = client.get_or_create_collection(name="arabic_docs")
16
  embedder = SentenceTransformer("sentence-transformers/LaBSE")
17
 
18
  # تقطيع النصوص من PDF
19
- def process_pdf(pdf_bytes):
20
- doc = fitz.open(stream=pdf_bytes, filetype="pdf")
 
21
  texts = []
22
- for p in doc:
23
- text = p.get_text()
24
  for chunk in text.split("\n\n"):
25
  if len(chunk.strip()) > 50:
26
  texts.append(chunk.strip())
27
  return texts
28
 
29
  # إدخال النصوص في قاعدة Chroma
30
- def ingest(pdf_bytes):
31
- texts = process_pdf(pdf_bytes)
32
  embeddings = embedder.encode(texts, show_progress_bar=True)
33
  for i, (chunk, emb) in enumerate(zip(texts, embeddings)):
34
  col.add(ids=[f"chunk_{i}"], embeddings=[emb.tolist()], metadatas=[{"text": chunk}])
@@ -37,8 +38,7 @@ def ingest(pdf_bytes):
37
  # استرجاع السياق من Chroma
38
  retriever = dspy.Retrieve(lambda q: [m["text"] for m in col.query(q, n_results=1)["metadatas"]])
39
 
40
- # تعريف التوقيع باستخدام InputField و OutputField
41
-
42
  class RagSig(dspy.Signature):
43
  question: str = dspy.InputField()
44
  context: str = dspy.InputField()
 
16
  embedder = SentenceTransformer("sentence-transformers/LaBSE")
17
 
18
  # تقطيع النصوص من PDF
19
def process_pdf(pdf_file):
    """Extract paragraph-sized text chunks from a PDF on disk.

    Args:
        pdf_file: Object exposing a ``name`` attribute that holds the path
            of the PDF file (presumably the upload handle supplied by the
            UI layer -- confirm against the caller).

    Returns:
        list[str]: Whitespace-stripped chunks in page order. Only chunks
        longer than 50 characters after stripping are kept.
    """
    texts = []
    # Open the document directly from its path; the context manager closes
    # it (and releases the underlying file handle) even if extraction fails.
    with fitz.open(pdf_file.name) as doc:
        for page in doc:
            # A blank line is treated as the boundary between chunks.
            for chunk in page.get_text().split("\n\n"):
                chunk = chunk.strip()
                if len(chunk) > 50:
                    texts.append(chunk)
    return texts
29
 
30
  # إدخال النصوص في قاعدة Chroma
31
+ def ingest(pdf_file):
32
+ texts = process_pdf(pdf_file)
33
  embeddings = embedder.encode(texts, show_progress_bar=True)
34
  for i, (chunk, emb) in enumerate(zip(texts, embeddings)):
35
  col.add(ids=[f"chunk_{i}"], embeddings=[emb.tolist()], metadatas=[{"text": chunk}])
 
38
  # استرجاع السياق من Chroma
39
  retriever = dspy.Retrieve(lambda q: [m["text"] for m in col.query(q, n_results=1)["metadatas"]])
40
 
41
+ # تعريف التوقيع باستخدام InputField و OutputField
 
42
  class RagSig(dspy.Signature):
43
  question: str = dspy.InputField()
44
  context: str = dspy.InputField()