File size: 1,898 Bytes
848b322
4254fda
848b322
4254fda
848b322
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4254fda
848b322
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import os
import docx
import PyPDF2

def extract_text_from_pdf(file):
    """Extract the text of every page of a PDF as one string.

    Args:
        file: Whatever ``PyPDF2.PdfReader`` accepts (a path or a binary
            file-like object).

    Returns:
        The text of each page followed by a newline, concatenated in page
        order. If reading fails, the text gathered so far is returned with a
        bracketed error note appended instead of raising, so callers always
        receive a string.
    """
    parts = []
    try:
        pdf_reader = PyPDF2.PdfReader(file)
        for page in pdf_reader.pages:
            # Guard against a falsy result (e.g. None for a page without a
            # text layer) so one such page cannot abort the remaining pages
            # with a TypeError.
            parts.append((page.extract_text() or "") + "\n")
    except Exception as e:
        # Best effort: keep whatever was extracted and record the failure
        # inline rather than propagating it.
        parts.append(f"\n[خطأ في قراءة PDF: {e}]\n")
    # Join once instead of growing a string with += inside the loop.
    return "".join(parts)

def extract_text_from_docx(file):
    """Return the text of a .docx document, one paragraph per line.

    Args:
        file: Whatever ``docx.Document`` accepts (a path or a file-like
            object).

    Returns:
        The ``text`` of every entry in the document's ``paragraphs``,
        joined with newline characters.
    """
    document = docx.Document(file)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)

def extract_text_from_txt(file):
    """Read a plain-text file object and return its content as ``str``.

    Args:
        file: A file-like object with a ``read()`` method. It may be opened
            in binary mode (``read()`` returns bytes) or in text mode
            (``read()`` returns ``str``).

    Returns:
        The decoded text. Bytes are decoded as UTF-8; a leading UTF-8 byte
        order mark, if present, is dropped.

    Raises:
        UnicodeDecodeError: If the bytes are not valid UTF-8.
    """
    data = file.read()
    if isinstance(data, str):
        # Already decoded by a text-mode handle; calling .decode() on it
        # would raise AttributeError.
        return data
    # "utf-8-sig" decodes plain UTF-8 identically but strips a leading BOM,
    # which would otherwise surface as "\ufeff" glued to the first word.
    return data.decode("utf-8-sig")

def chunk_text(text, chunk_size=300, overlap=50):
    """Split text into overlapping chunks of whitespace-separated words.

    Args:
        text: The text to split. Words are obtained with ``str.split()``,
            so any run of whitespace is a separator.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared between consecutive chunks. Must be
            smaller than ``chunk_size``.

    Returns:
        A list of chunks, each a string of words joined by single spaces.
        Empty or whitespace-only text yields an empty list. The last chunk
        may be shorter than ``chunk_size``.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap >= chunk_size:
        # A step of zero or less would never advance through the words.
        raise ValueError("overlap must be smaller than chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        end = start + chunk_size
        chunks.append(" ".join(words[start:end]))
        if end >= len(words):
            # This window already reached the last word; a further window
            # would contain only words repeated from this one.
            break
    return chunks

def process_documents(files, log_callback=None):
    """Extract and chunk the text of several uploaded documents.

    Args:
        files: Iterable of file objects. Each must expose a ``name``
            attribute; the text after the last dot of its base name,
            lower-cased, selects the extractor (``pdf``, ``docx`` or
            ``txt``).
        log_callback: Optional callable that receives progress messages as
            strings. Nothing is reported when it is falsy.

    Returns:
        A single list with the chunks of every successfully processed file,
        in input order. Files with an unsupported extension are skipped, and
        a file whose processing raises is reported and skipped as well.
    """

    def notify(message):
        # Forward a progress message only when a callback was supplied.
        if log_callback:
            log_callback(message)

    all_chunks = []
    for handle in files:
        filename = os.path.basename(handle.name)
        ext = filename.rsplit(".", 1)[-1].lower()
        notify(f"📁 معالجة الملف: {filename}")

        try:
            # Choose the extractor matching the extension.
            if ext == "pdf":
                reader = extract_text_from_pdf
            elif ext == "docx":
                reader = extract_text_from_docx
            elif ext == "txt":
                reader = extract_text_from_txt
            else:
                notify(f"❗️ تنسيق غير مدعوم: {ext}")
                continue

            chunks = chunk_text(reader(handle))
            all_chunks.extend(chunks)
            notify(f"✅ تم استخراج {len(chunks)} مقطع من {filename}")
        except Exception as e:
            # One failing file must not stop the rest of the batch.
            notify(f"⚠️ فشل في معالجة {filename}: {e}")
    return all_chunks