ramysaidagieb committed on
Commit
848b322
·
verified ·
1 Parent(s): 021d4f9

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +58 -14
utils.py CHANGED
@@ -1,17 +1,61 @@
1
- from PyPDF2 import PdfReader
2
  import docx
 
3
 
4
def extract_text_from_files(files):
    """Concatenate the text extracted from PDF, DOCX and TXT files.

    Args:
        files: Iterable of file-like objects exposing a ``name`` attribute.
            The extension of ``name`` (matched case-insensitively) selects
            the extractor; ``.txt`` files must return UTF-8 bytes from
            ``read()``.

    Returns:
        One string in which every PDF page, every DOCX paragraph and every
        TXT file is followed by a newline. Files with any other extension
        are skipped.
    """
    parts = []
    for file in files:
        # Lower-case once so "REPORT.PDF" / "notes.TXT" are not silently dropped.
        name = file.name.lower()
        if name.endswith(".pdf"):
            reader = PdfReader(file)
            # ``or ""`` keeps a page with no extractable text from breaking
            # the string concatenation.
            parts.extend(
                (page.extract_text() or "") + "\n" for page in reader.pages
            )
        elif name.endswith(".docx"):
            doc = docx.Document(file)
            parts.extend(para.text + "\n" for para in doc.paragraphs)
        elif name.endswith(".txt"):
            parts.append(file.read().decode("utf-8") + "\n")
    # Single join instead of repeated ``+=`` on a growing string.
    return "".join(parts)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
  import docx
3
+ import PyPDF2
4
 
5
def extract_text_from_pdf(file):
    """Extract the text of every page of a PDF.

    Args:
        file: Object handed unchanged to ``PyPDF2.PdfReader``.

    Returns:
        The page texts, each followed by a newline. If reading fails, the
        text collected so far is kept and an inline error marker containing
        the exception message is appended; the exception is not re-raised.
    """
    parts = []
    try:
        pdf_reader = PyPDF2.PdfReader(file)
        for page in pdf_reader.pages:
            # ``or ""`` guards against a falsy result for a page with no
            # extractable text, which would otherwise raise TypeError and
            # abandon all remaining pages.
            parts.append((page.extract_text() or "") + "\n")
    except Exception as e:
        # Deliberate best effort: report the failure inline instead of
        # propagating, keeping whatever pages were already read.
        parts.append(f"\n[خطأ في قراءة PDF: {e}]\n")
    return "".join(parts)
14
+
15
def extract_text_from_docx(file):
    """Return the paragraphs of a DOCX document joined by newlines.

    Args:
        file: Object handed unchanged to ``docx.Document``.

    Returns:
        The ``text`` of each entry in the document's ``paragraphs``,
        separated by a single newline.
    """
    document = docx.Document(file)
    paragraph_texts = [paragraph.text for paragraph in document.paragraphs]
    return "\n".join(paragraph_texts)
18
+
19
def extract_text_from_txt(file):
    """Return the contents of a plain-text file as a string.

    Args:
        file: File-like object whose ``read()`` returns UTF-8 encoded bytes,
            or an already-decoded ``str`` (text-mode handle).

    Returns:
        The decoded text. For byte input a leading UTF-8 byte-order mark is
        removed; ``str`` input is returned unchanged.

    Raises:
        UnicodeDecodeError: If the bytes are not valid UTF-8.
    """
    data = file.read()
    if isinstance(data, str):
        # Text-mode handle: nothing to decode.
        return data
    # "utf-8-sig" decodes exactly like "utf-8" but drops a leading BOM, which
    # would otherwise stay glued to the first word as "\ufeff".
    return data.decode("utf-8-sig")
21
+
22
def chunk_text(text, chunk_size=300, overlap=50):
    """Split text into overlapping chunks of whitespace-separated words.

    Args:
        text: The text to split; words are obtained with ``str.split()``.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared by consecutive chunks. Must be
            smaller than ``chunk_size`` so that each chunk advances.

    Returns:
        A list of chunks, each with its words joined by single spaces.
        Empty when ``text`` contains no words. The last chunk ends at the
        final word; no extra chunk is emitted that would only repeat the
        tail of the previous one.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    step = chunk_size - overlap
    if step <= 0:
        # A non-positive step would never advance and loop forever.
        raise ValueError("overlap must be smaller than chunk_size")

    words = text.split()
    chunks = []
    for start in range(0, len(words), step):
        end = start + chunk_size
        chunks.append(" ".join(words[start:end]))
        if end >= len(words):
            # This chunk already reached the last word; any further window
            # would be a pure subset of it.
            break
    return chunks
32
+
33
def process_documents(files, log_callback=None):
    """Extract and chunk the text of each supported file.

    Args:
        files: Iterable of objects exposing a ``name`` attribute; the text
            after the last dot of its base name (lower-cased) selects the
            extractor: ``pdf``, ``docx`` or ``txt``.
        log_callback: Optional callable receiving one progress message per
            event (file started, unsupported format, chunks extracted,
            failure). Ignored when falsy.

    Returns:
        A flat list with the chunks of every successfully processed file,
        in input order. Unsupported files and files whose processing raised
        contribute nothing.
    """
    def report(message):
        # Progress reporting is optional; stay silent without a callback.
        if log_callback:
            log_callback(message)

    collected = []
    for file in files:
        filename = os.path.basename(file.name)
        ext = filename.rsplit(".", 1)[-1].lower()
        report(f"📁 معالجة الملف: {filename}")

        try:
            if ext not in ("pdf", "docx", "txt"):
                report(f"❗️ تنسيق غير مدعوم: {ext}")
                continue

            if ext == "pdf":
                text = extract_text_from_pdf(file)
            elif ext == "docx":
                text = extract_text_from_docx(file)
            else:
                text = extract_text_from_txt(file)

            pieces = chunk_text(text)
            collected.extend(pieces)
            report(f"✅ تم استخراج {len(pieces)} مقطع من {filename}")
        except Exception as error:
            # One bad file must not abort the batch: log it and move on.
            report(f"⚠️ فشل في معالجة {filename}: {error}")
    return collected