RagGV1 / utils.py
ramysaidagieb's picture
Upload 5 files
99354e0 verified
raw
history blame
1.02 kB
import fitz
import docx
import re
def extract_texts_from_files(file):
    """Extract plain text from an uploaded PDF or DOCX file.

    The parser is chosen from the extension of ``file.name`` (compared
    case-insensitively). Extraction is best-effort: failures are logged and
    reported to the caller as an empty string rather than raised.

    Args:
        file: File-like object exposing ``name``; for PDFs its ``read()``
            result is handed to PyMuPDF, for DOCX the object itself is
            handed to python-docx.

    Returns:
        The text of all pages (PDF) or paragraphs (DOCX) joined with
        newlines, or ``""`` when the extension is unsupported or the
        extraction fails.
    """
    try:
        name = file.name.lower()
        if name.endswith(".pdf"):
            # Context manager guarantees the PyMuPDF document is closed.
            with fitz.open(stream=file.read(), filetype="pdf") as doc:
                return "\n".join(page.get_text() for page in doc)
        if name.endswith(".docx"):
            document = docx.Document(file)
            return "\n".join(p.text for p in document.paragraphs)
    except Exception:
        # Deliberately broad: this is the best-effort boundary for arbitrary
        # user uploads. Log instead of swallowing silently.
        import logging

        logging.getLogger(__name__).exception(
            "Failed to extract text from %r", getattr(file, "name", file)
        )
    # Unsupported extension or failed extraction: always return a string.
    return ""
def clean_arabic(text):
    """Strip everything but Arabic-block characters and normalize spacing.

    Characters outside U+0600..U+06FF that are not whitespace are removed;
    the remaining whitespace runs are collapsed to single spaces and the
    result has no leading or trailing whitespace.
    """
    arabic_only = re.sub(r'[^\u0600-\u06FF\s]', '', text)
    # split() with no argument drops leading/trailing whitespace and treats
    # any whitespace run as one separator, so re-joining collapses the runs.
    words = arabic_only.split()
    return " ".join(words)
def chunk_text(text, source="مصدر غير معروف", max_words=150):
    """Group the sentences of ``text`` into chunks tagged with ``source``.

    The text is split after ``.``, ``!`` or ``؟`` followed by whitespace.
    Sentences are accumulated until the running word count exceeds
    ``max_words``; the accumulated sentences are then emitted as one chunk.
    ``max_words`` is therefore a soft threshold: a chunk is closed by the
    sentence that pushes it over the limit, so it may be longer than
    ``max_words``.

    Args:
        text: Text to split.
        source: Label stored alongside every chunk.
        max_words: Word count above which the current chunk is closed.

    Returns:
        A list of ``(chunk_text, source)`` tuples. Empty or whitespace-only
        input yields an empty list.
    """
    chunks = []
    current = []
    word_count = 0
    for sentence in re.split(r'(?<=[.!؟])\s+', text):
        n_words = len(sentence.split())
        if n_words == 0:
            # Splitting empty text or trailing whitespace produces empty
            # pieces; skipping them avoids empty chunks and stray spaces.
            continue
        current.append(sentence)
        # Running total equals the word count of the joined buffer, without
        # re-joining and re-splitting it on every iteration.
        word_count += n_words
        if word_count > max_words:
            chunks.append((" ".join(current), source))
            current = []
            word_count = 0
    if current:
        chunks.append((" ".join(current), source))
    return chunks