Spaces:
Sleeping
Sleeping
Update utils.py
Browse files
utils.py
CHANGED
@@ -1,17 +1,61 @@
|
|
1 |
-
|
2 |
import docx
|
|
|
3 |
|
4 |
-
def
|
5 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
for file in files:
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
import docx
|
3 |
+
import PyPDF2
|
4 |
|
5 |
+
def extract_text_from_pdf(file):
    """Extract the text of a PDF, page by page.

    Each page's text is followed by a newline. Extraction is best-effort:
    if opening the document or reading any page raises, the text gathered
    so far is kept and a bracketed error note (in Arabic, containing the
    exception message) is appended instead of propagating the exception.

    Args:
        file: Whatever ``PyPDF2.PdfReader`` accepts as its source.

    Returns:
        The concatenated page texts, possibly ending with the error note.
    """
    pieces = []
    try:
        reader = PyPDF2.PdfReader(file)
        for pdf_page in reader.pages:
            pieces.append(pdf_page.extract_text() + "\n")
    except Exception as e:
        # Keep what was read before the failure and flag the problem inline.
        pieces.append(f"\n[خطأ في قراءة PDF: {e}]\n")
    return "".join(pieces)
|
14 |
+
|
15 |
+
def extract_text_from_docx(file):
    """Return the paragraph texts of a Word document joined by newlines.

    Args:
        file: Whatever ``docx.Document`` accepts as its source.

    Returns:
        One string with each entry of the document's ``paragraphs``
        on its own line.
    """
    document = docx.Document(file)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)
|
18 |
+
|
19 |
+
def extract_text_from_txt(file):
    """Read a plain-text upload and return its content as ``str``.

    Bytes are decoded as UTF-8; a leading byte-order mark is dropped so it
    does not end up attached to the first word. If the handle was opened in
    text mode (``read()`` already returns ``str``) the content is used as
    is, minus a leading BOM character.

    Args:
        file: An object with a ``read()`` method returning bytes or str.

    Returns:
        The decoded text.

    Raises:
        UnicodeDecodeError: If the bytes are not valid UTF-8.
    """
    raw = file.read()
    if isinstance(raw, str):
        # Text-mode handle: nothing to decode, only strip a stray BOM.
        return raw[1:] if raw.startswith("\ufeff") else raw
    # "utf-8-sig" decodes ordinary UTF-8 identically and removes a BOM.
    return raw.decode("utf-8-sig")
|
21 |
+
|
22 |
+
def chunk_text(text, chunk_size=300, overlap=50):
    """Split text into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart, so each
    chunk repeats the last ``overlap`` words of the previous one. Chunking
    stops as soon as a chunk reaches the final word, which avoids emitting
    a trailing chunk made only of words already present in the previous
    chunk.

    Args:
        text: The text to split; words are obtained with ``str.split()``.
        chunk_size: Maximum number of words per chunk (at least 1).
        overlap: Number of words shared between neighbouring chunks; must
            be smaller than ``chunk_size``.

    Returns:
        A list of chunks, each a string of words joined by single spaces.
        Empty or whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is less than 1 or ``overlap`` is not
            smaller than ``chunk_size`` (the window would never advance).
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        end = start + chunk_size
        chunks.append(" ".join(words[start:end]))
        if end >= len(words):
            # The last word is covered; a further window would only
            # repeat the overlap region.
            break
    return chunks
|
32 |
+
|
33 |
+
def process_documents(files, log_callback=None):
    """Extract and chunk the text of every supported uploaded file.

    The format is chosen from the file name's extension (``pdf``, ``docx``
    or ``txt``, case-insensitive). The extension is taken with
    ``os.path.splitext``, so a name without a dot has no extension and is
    reported as unsupported rather than being mistaken for a format.
    Processing is best-effort per file: an exception while handling one
    file is reported through ``log_callback`` and the remaining files are
    still processed.

    Args:
        files: Iterable of file objects exposing a ``name`` attribute and
            accepted by the matching ``extract_text_from_*`` helper.
        log_callback: Optional callable receiving one progress/error
            message (str) at a time. When omitted, nothing is reported,
            including per-file failures.

    Returns:
        A flat list with the chunks of all successfully processed files,
        in input order.
    """
    # A falsy callback means "no logging", as in the original truthiness test.
    log = log_callback or (lambda _message: None)
    all_chunks = []
    for file in files:
        filename = os.path.basename(file.name)
        # splitext yields "" for names without a dot, unlike split(".")[-1],
        # which would return the whole name.
        ext = os.path.splitext(filename)[1][1:].lower()
        log(f"📁 معالجة الملف: {filename}")

        try:
            if ext == "pdf":
                text = extract_text_from_pdf(file)
            elif ext == "docx":
                text = extract_text_from_docx(file)
            elif ext == "txt":
                text = extract_text_from_txt(file)
            else:
                # Show the file name when there is no extension to show.
                log(f"❗️ تنسيق غير مدعوم: {ext or filename}")
                continue

            chunks = chunk_text(text)
            all_chunks.extend(chunks)
            log(f"✅ تم استخراج {len(chunks)} مقطع من {filename}")
        except Exception as e:
            # Per-file boundary: report and move on to the next file.
            log(f"⚠️ فشل في معالجة {filename}: {e}")
    return all_chunks
|