Spaces:
Sleeping
Sleeping
import fitz | |
import docx | |
import re | |
def extract_texts_from_files(file):
    """Extract the plain text of a PDF or DOCX file-like object.

    Args:
        file: Object exposing a ``name`` attribute used to detect the
            format. For PDFs its ``read()`` result is handed to PyMuPDF;
            for DOCX the object itself is handed to python-docx.

    Returns:
        The page texts (PDF) or paragraph texts (DOCX) joined with
        newlines. An empty string is returned when the extension is not
        supported or when extraction fails for any reason, so callers
        always receive a ``str``.
    """
    try:
        # Compare case-insensitively so "REPORT.PDF" is handled like ".pdf".
        filename = file.name.lower()
        if filename.endswith(".pdf"):
            # The context manager closes the document once the text is read.
            with fitz.open(stream=file.read(), filetype="pdf") as doc:
                return "\n".join(page.get_text() for page in doc)
        if filename.endswith(".docx"):
            document = docx.Document(file)
            return "\n".join(p.text for p in document.paragraphs)
    except Exception:
        # Deliberate best-effort: an unreadable file yields no text
        # instead of aborting the caller.
        return ""
    # Unsupported extension: return the same "no text" value as the error
    # path rather than an implicit None.
    return ""
def clean_arabic(text):
    """Keep only Arabic-block characters and normalize whitespace.

    Every character outside U+0600-U+06FF that is not whitespace is
    deleted (not replaced by a space), then each run of whitespace is
    collapsed to a single space and the ends are trimmed.

    Args:
        text: Input string.

    Returns:
        The filtered, whitespace-normalized string.
    """
    arabic_only = re.sub(r'[^\u0600-\u06FF\s]', '', text)
    single_spaced = re.sub(r'\s+', ' ', arabic_only)
    return single_spaced.strip()
def chunk_text(text, source="مصدر غير معروف", max_words=150):
    """Group the sentences of ``text`` into chunks of at most ``max_words``.

    Sentences are split after ``.``, ``!`` or the Arabic question mark
    ``؟`` when followed by whitespace. Whole sentences are packed into a
    chunk until adding the next one would exceed ``max_words``; that
    sentence then starts a new chunk. A single sentence longer than
    ``max_words`` is kept intact as its own chunk rather than being cut
    mid-sentence.

    Args:
        text: Text to split.
        source: Label attached to every chunk.
        max_words: Upper bound on whitespace-separated words per chunk.

    Returns:
        A list of ``(chunk_text, source)`` tuples; empty when ``text``
        contains no words.
    """
    chunks = []
    current = []
    current_words = 0
    for sentence in re.split(r'(?<=[.!؟])\s+', text):
        sentence = sentence.strip()
        word_count = len(sentence.split())
        if word_count == 0:
            # Empty input or trailing whitespace yields empty pieces.
            continue
        # Flush before the limit would be exceeded, not after, so emitted
        # chunks respect max_words.
        if current and current_words + word_count > max_words:
            chunks.append((" ".join(current), source))
            current = []
            current_words = 0
        current.append(sentence)
        # Running total avoids re-joining and re-splitting the chunk.
        current_words += word_count
    if current:
        chunks.append((" ".join(current), source))
    return chunks