"""Helpers for extracting, cleaning and chunking Arabic text from uploads."""

import logging
import re

# The document parsers are only needed by extract_texts_from_files(); the
# pure-text helpers below must stay importable when a parser is missing.
try:
    import docx
except ImportError:  # pragma: no cover - depends on the environment
    docx = None

try:
    import fitz
except ImportError:  # pragma: no cover - depends on the environment
    fitz = None

logger = logging.getLogger(__name__)

# Everything that is neither in the Unicode "Arabic" block nor whitespace.
_NON_ARABIC_RE = re.compile(r'[^\u0600-\u06FF\s]')
_WHITESPACE_RUN_RE = re.compile(r'\s+')
# Whitespace that directly follows ".", "!" or the Arabic question mark.
_SENTENCE_BOUNDARY_RE = re.compile(r'(?<=[.!؟])\s+')


def _extract_pdf_text(file):
    """Return the text of every page of a PDF upload, joined by newlines."""
    if fitz is None:
        raise RuntimeError("PyMuPDF (fitz) is required to read PDF files")
    # The context manager closes the document even if a page fails to parse.
    with fitz.open(stream=file.read(), filetype="pdf") as doc:
        return "\n".join(page.get_text() for page in doc)


def _extract_docx_text(file):
    """Return the text of every paragraph of a DOCX upload, joined by newlines."""
    if docx is None:
        raise RuntimeError("python-docx is required to read DOCX files")
    document = docx.Document(file)
    return "\n".join(paragraph.text for paragraph in document.paragraphs)


def extract_texts_from_files(file):
    """Extract plain text from an uploaded ``.pdf`` or ``.docx`` file.

    This is a best-effort helper: it never raises. Any failure is logged and
    reported to the caller as an empty string.

    Args:
        file: A binary file-like object exposing a ``name`` attribute and a
            ``read()`` method (presumably a web-framework upload object;
            verify against the caller).

    Returns:
        The extracted text, or ``""`` when the extension is unsupported
        or extraction fails. The extension match is case-insensitive.
    """
    name = str(getattr(file, "name", "") or "")
    lowered = name.lower()
    try:
        if lowered.endswith(".pdf"):
            return _extract_pdf_text(file)
        if lowered.endswith(".docx"):
            return _extract_docx_text(file)
    except Exception:
        # Deliberately broad: the parsers raise library-specific errors and
        # callers rely on getting "" back, but the failure must be visible.
        logger.exception("Failed to extract text from %r", name)
        return ""
    logger.warning("Unsupported file type, no text extracted: %r", name)
    return ""


def clean_arabic(text: str) -> str:
    """Keep only Arabic-block characters and normalise whitespace.

    Every character outside U+0600-U+06FF that is not whitespace is removed,
    then runs of whitespace collapse to one space and the ends are trimmed.

    Note that this also removes "." and "!", which ``chunk_text`` uses as
    sentence boundaries; only the Arabic question mark survives cleaning.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text (possibly empty).
    """
    arabic_only = _NON_ARABIC_RE.sub('', text)
    return _WHITESPACE_RUN_RE.sub(' ', arabic_only).strip()


def _iter_pieces(text, max_words):
    """Yield ``(piece, word_count)`` pairs, each at most ``max_words`` long.

    A piece is a whole sentence when it fits; a longer sentence is cut into
    consecutive windows of ``max_words`` words. Blank sentences are skipped.
    """
    for sentence in _SENTENCE_BOUNDARY_RE.split(text):
        words = sentence.split()
        if not words:
            continue
        if len(words) <= max_words:
            yield sentence.strip(), len(words)
            continue
        for start in range(0, len(words), max_words):
            window = words[start:start + max_words]
            yield " ".join(window), len(window)


def chunk_text(text: str, source: str = "مصدر غير معروف", max_words: int = 150):
    """Split ``text`` into chunks of at most ``max_words`` words.

    Sentences are packed greedily into a chunk until the next one would push
    it past ``max_words``. A sentence that alone exceeds the limit is cut on
    word boundaries, so the limit holds for every chunk.

    Args:
        text: The text to split. Empty or whitespace-only text gives ``[]``.
        source: Label attached to every chunk.
        max_words: Upper bound on whitespace-separated words per chunk.

    Returns:
        A list of ``(chunk, source)`` tuples in document order.

    Raises:
        ValueError: If ``max_words`` is smaller than 1.
    """
    if max_words < 1:
        raise ValueError("max_words must be a positive integer")
    if not text:
        return []

    chunks = []
    pending = []
    pending_words = 0  # running total avoids re-joining the buffer each step
    for piece, word_count in _iter_pieces(text, max_words):
        # Flush before appending so a chunk never grows past the limit.
        if pending and pending_words + word_count > max_words:
            chunks.append((" ".join(pending), source))
            pending = []
            pending_words = 0
        pending.append(piece)
        pending_words += word_count
    if pending:
        chunks.append((" ".join(pending), source))
    return chunks