File size: 1,021 Bytes
99354e0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import logging
import re

import docx
import fitz

def extract_texts_from_files(file):
    """Extract plain text from an uploaded PDF or DOCX file.

    Args:
        file: A binary file-like object exposing a ``name`` attribute and a
            ``read()`` method. The extension of ``name`` selects the parser.

    Returns:
        The extracted text, with PDF pages or DOCX paragraphs joined by
        newlines. Returns an empty string when the extension is not
        supported or when extraction fails for any reason, so callers
        always receive a ``str``.
    """
    try:
        # Compare case-insensitively so "REPORT.PDF" is handled like
        # "report.pdf".
        filename = file.name.lower()
        if filename.endswith(".pdf"):
            # The context manager closes the document once the page text
            # has been collected instead of leaving it open.
            with fitz.open(stream=file.read(), filetype="pdf") as doc:
                return "\n".join(page.get_text() for page in doc)
        if filename.endswith(".docx"):
            document = docx.Document(file)
            return "\n".join(p.text for p in document.paragraphs)
    except Exception:
        # Extraction is best-effort: a corrupt or unreadable upload must not
        # crash the caller, but the failure is logged rather than hidden.
        logging.getLogger(__name__).exception(
            "Failed to extract text from %r", getattr(file, "name", file)
        )
        return ""
    # Unsupported extension: return the same empty result as a failed
    # extraction rather than an implicit None.
    return ""

def clean_arabic(text):
    """Reduce *text* to Arabic-block characters separated by single spaces.

    Every character outside U+0600-U+06FF that is not whitespace is removed,
    then each run of whitespace is collapsed to one space and the result is
    trimmed at both ends.
    """
    # Drop everything that is neither in the Arabic block nor whitespace.
    arabic_only = re.sub(r'[^\u0600-\u06FF\s]', '', text)
    # Removing characters can leave whitespace runs behind; squeeze them.
    collapsed = re.sub(r'\s+', ' ', arabic_only)
    return collapsed.strip()

def chunk_text(text, source="مصدر غير معروف", max_words=150):
    """Group the sentences of *text* into chunks of at most *max_words* words.

    Sentences are split after ``.``, ``!`` or the Arabic question mark when
    followed by whitespace. Consecutive sentences are packed into a chunk
    until adding the next one would exceed *max_words*; that sentence then
    starts a new chunk. Sentences are never split, so a single sentence
    longer than *max_words* becomes a chunk of its own.

    Args:
        text: The text to split. Empty or whitespace-only text yields no
            chunks.
        source: Label attached to every chunk. The default is the Arabic
            phrase for "unknown source".
        max_words: Maximum number of whitespace-separated words per chunk.

    Returns:
        A list of ``(chunk_text, source)`` tuples in document order.
    """
    if not text:
        return []

    chunks = []
    current = []
    current_words = 0
    for sentence in re.split(r'(?<=[.!؟])\s+', text):
        sentence = sentence.strip()
        word_count = len(sentence.split())
        if not word_count:
            # Blank fragments (e.g. from trailing whitespace) carry no text.
            continue
        # Flush before adding, so a chunk never grows past max_words.
        if current and current_words + word_count > max_words:
            chunks.append((" ".join(current), source))
            current = []
            current_words = 0
        current.append(sentence)
        # Keep a running total instead of re-counting the whole chunk.
        current_words += word_count
    if current:
        chunks.append((" ".join(current), source))
    return chunks