ramysaidagieb committed on
Commit
a04746d
·
verified ·
1 Parent(s): 717f924

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +28 -58
utils.py CHANGED
@@ -1,61 +1,31 @@
1
  import os
2
- import docx
3
- import PyPDF2
4
 
5
def extract_text_from_pdf(file):
    """Extract the text of every page of a PDF as one string.

    Args:
        file: The PDF source, handed unchanged to ``PyPDF2.PdfReader``.

    Returns:
        The text of each page followed by a newline, in page order. This
        function never raises: if reading or extraction fails, the text
        gathered so far is kept and an inline error marker (an Arabic
        message carrying the exception text) is appended.
    """
    pieces = []
    try:
        for page in PyPDF2.PdfReader(file).pages:
            pieces.append(page.extract_text() + "\n")
    except Exception as e:
        # Best-effort: surface the failure inside the returned text.
        pieces.append(f"\n[خطأ في قراءة PDF: {e}]\n")
    return "".join(pieces)
14
-
15
def extract_text_from_docx(file):
    """Return the text of all paragraphs of a Word document, newline-joined.

    Args:
        file: The document source, handed unchanged to ``docx.Document``.

    Returns:
        A single string with one line per paragraph (blank paragraphs
        included).
    """
    document = docx.Document(file)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)
18
-
19
def extract_text_from_txt(file):
    """Read a UTF-8 text document and return its content as a string.

    Args:
        file: Either an open file-like object (anything with ``read()``)
            or a filesystem path. Accepting a path matters because the
            caller in this file also applies ``os.path.basename`` to the
            same value, which only works for path-like inputs.

    Returns:
        The decoded text. A leading UTF-8 byte-order mark, if present, is
        stripped.

    Raises:
        UnicodeDecodeError: If the bytes are not valid UTF-8.
        OSError: If ``file`` is a path that cannot be opened.
    """
    if hasattr(file, "read"):
        data = file.read()
    else:
        with open(file, "rb") as handle:
            data = handle.read()

    # A handle opened in text mode already yields ``str``; nothing to decode.
    if isinstance(data, str):
        return data
    # "utf-8-sig" decodes plain UTF-8 identically and drops an optional BOM.
    return data.decode("utf-8-sig")
21
-
22
def chunk_text(text, chunk_size=300, overlap=50):
    """Split text into overlapping chunks of whitespace-separated words.

    Args:
        text: The text to split. Words are delimited by any whitespace.
        chunk_size: Maximum number of words per chunk.
        overlap: Number of words shared between consecutive chunks.

    Returns:
        A list of chunks, each a string of words joined by single spaces.
        Empty or whitespace-only text yields an empty list. Chunking stops
        as soon as a chunk reaches the last word, so no trailing chunk is
        emitted that is entirely contained in the previous one.

    Raises:
        ValueError: If ``overlap`` is not smaller than ``chunk_size``; the
            window could then never advance.
    """
    step = chunk_size - overlap
    if step <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    words = text.split()
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once the window covers the final word, any further window would
        # only repeat words already emitted.
        if start + chunk_size >= len(words):
            break
    return chunks
32
-
33
def process_documents(files, log_callback=None):
    """Extract and chunk the text of several documents.

    Args:
        files: Iterable of documents; each item is passed to
            ``os.path.basename`` and then to the extractor selected by its
            file extension (``pdf``, ``docx`` or ``txt``).
        log_callback: Optional callable receiving progress messages (the
            messages themselves are in Arabic). Ignored when falsy.

    Returns:
        A flat list with the chunks of every document that was processed
        successfully. Unsupported formats are skipped, and a failure while
        processing one document is reported through ``log_callback``
        without interrupting the remaining documents.
    """
    gathered = []
    for file in files:
        filename = os.path.basename(file)
        ext = filename.rsplit(".", 1)[-1].lower()
        if log_callback:
            log_callback(f"📁 معالجة الملف: {filename}")

        try:
            extractor = {
                "pdf": extract_text_from_pdf,
                "docx": extract_text_from_docx,
                "txt": extract_text_from_txt,
            }.get(ext)
            if extractor is None:
                if log_callback:
                    log_callback(f"❗️ تنسيق غير مدعوم: {ext}")
                continue

            chunks = chunk_text(extractor(file))
            gathered.extend(chunks)

            if log_callback:
                log_callback(f"✅ تم استخراج {len(chunks)} مقطع من {filename}")
        except Exception as e:
            # Keep going: one unreadable document must not stop the batch.
            if log_callback:
                log_callback(f"⚠️ فشل في معالجة {filename}: {e}")
    return gathered
 
1
  import os
2
+ from PyPDF2 import PdfReader
3
+ from docx import Document
4
 
5
def process_pdf(file_path):
    """Read a PDF and return its text split into paragraphs.

    Args:
        file_path: The PDF source, handed unchanged to ``PdfReader``.

    Returns:
        A list of non-blank text fragments obtained by splitting the
        concatenated page text on blank lines. A document without any
        extractable text yields an empty list.
    """
    reader = PdfReader(file_path)
    # NOTE(review): the ``or ""`` guard assumes a page may yield no text
    # (empty or None) depending on the PyPDF2 version — confirm.
    text = "".join((page.extract_text() or "") + "\n" for page in reader.pages)
    # Split the text into paragraphs, dropping blank fragments so the result
    # matches what process_docx returns for empty paragraphs.
    return [fragment for fragment in text.split('\n\n') if fragment.strip()]
11
+
12
def process_docx(file_path):
    """Return the non-blank paragraphs of a Word document.

    Args:
        file_path: The document source, handed unchanged to ``Document``.

    Returns:
        A list with the text of every paragraph that contains something
        other than whitespace, in document order.
    """
    kept = []
    for paragraph in Document(file_path).paragraphs:
        content = paragraph.text
        if content.strip():
            kept.append(content)
    return kept
16
+
17
def process_txt(file_path):
    """Read a UTF-8 text file and return its paragraphs.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of non-blank fragments obtained by splitting the file
        content on blank lines. An empty file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    # "utf-8-sig" reads plain UTF-8 identically and drops an optional BOM,
    # which would otherwise end up inside the first paragraph.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        text = f.read()
    # Drop blank fragments, matching how process_docx skips empty paragraphs.
    return [fragment for fragment in text.split('\n\n') if fragment.strip()]
21
+
22
def process_documents(file_path):
    """Split one document into paragraphs based on its file extension.

    Args:
        file_path: Path of the document. The extension is matched
            case-insensitively against ``.pdf``, ``.docx`` and ``.txt``.

    Returns:
        The list produced by the matching ``process_*`` function, or an
        empty list when the extension is not supported.
    """
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    # Guard clause: anything other than the three known formats is ignored.
    if suffix not in ('.pdf', '.docx', '.txt'):
        return []
    if suffix == '.pdf':
        return process_pdf(file_path)
    if suffix == '.docx':
        return process_docx(file_path)
    return process_txt(file_path)