Spaces:

ramysaidagieb
/

RagGV1

Sleeping

RagGV1 / utils.py

Upload 5 files

99354e0 verified 3 months ago

1.02 kB

	import fitz
	import docx
	import re

	def extract_texts_from_files(file):
	    """Extract plain text from an uploaded PDF or DOCX file.

	    Args:
	        file: File-like object exposing ``name`` (its extension selects the
	            parser). For PDFs ``file.read()`` is called and the result is
	            passed to ``fitz.open`` as a stream; DOCX input is handed
	            directly to ``docx.Document``.

	    Returns:
	        The text of all pages (PDF) or paragraphs (DOCX) joined by newlines.
	        An empty string is returned when the extension is unsupported or
	        when extraction fails for any reason; the function never raises.
	    """
	    # Function-scope import keeps this block self-contained.
	    import logging

	    try:
	        # Compare case-insensitively so "REPORT.PDF" is treated like
	        # "report.pdf".
	        name = file.name.lower()
	        if name.endswith(".pdf"):
	            # The context manager closes the document once the pages
	            # have been read, instead of leaking it.
	            with fitz.open(stream=file.read(), filetype="pdf") as doc:
	                return "\n".join(page.get_text() for page in doc)
	        if name.endswith(".docx"):
	            document = docx.Document(file)
	            return "\n".join(p.text for p in document.paragraphs)
	    except Exception:
	        # Deliberate best-effort: a corrupt or unreadable upload must not
	        # crash the caller, but the failure is logged rather than hidden.
	        logging.getLogger(__name__).warning(
	            "Failed to extract text from %r",
	            getattr(file, "name", file),
	            exc_info=True,
	        )
	        return ""
	    # Unsupported extension: return "" (not None) so callers always get str.
	    return ""

	def clean_arabic(text):
	    """Reduce *text* to Arabic-block characters separated by single spaces.

	    Every character outside the Unicode range U+0600-U+06FF that is not
	    whitespace is removed. Runs of whitespace are then collapsed to one
	    space and leading/trailing whitespace is trimmed.
	    """
	    # Drop everything that is neither in the Arabic block nor whitespace.
	    arabic_only = re.sub(r'[^\u0600-\u06FF\s]', '', text)
	    # Normalise the whitespace left behind by the removed characters.
	    single_spaced = re.sub(r'\s+', ' ', arabic_only)
	    return single_spaced.strip()

	def chunk_text(text, source="مصدر غير معروف", max_words=150):
	    """Split *text* into chunks of at most ``max_words`` words each.

	    The text is first split into sentences after ``.``, ``!`` or the Arabic
	    question mark. Consecutive sentences are packed into a chunk while the
	    chunk stays within the word limit; a sentence that would overflow the
	    current chunk starts a new one. A single sentence longer than the limit
	    is cut into consecutive windows of ``max_words`` words so that no chunk
	    exceeds the limit.

	    Args:
	        text: The text to split.
	        source: Label attached to every chunk.
	        max_words: Upper bound on words per chunk; values below 1 are
	            treated as 1.

	    Returns:
	        A list of ``(chunk_text, source)`` tuples in document order. Empty
	        or whitespace-only input yields an empty list.
	    """
	    limit = max(1, max_words)
	    chunks = []
	    current = []
	    current_words = 0  # running count avoids re-joining the buffer each time
	    for sentence in re.split(r'(?<=[.!؟])\s+', text):
	        for piece, n_words in _split_oversized(sentence, limit):
	            # Flush BEFORE adding a piece that would overflow, so the limit
	            # is a true upper bound rather than a trigger crossed afterwards.
	            if current and current_words + n_words > limit:
	                chunks.append((" ".join(current), source))
	                current = []
	                current_words = 0
	            current.append(piece)
	            current_words += n_words
	    if current:
	        chunks.append((" ".join(current), source))
	    return chunks


	def _split_oversized(sentence, limit):
	    """Yield ``(text, word_count)`` pieces of *sentence*, each <= *limit* words."""
	    words = sentence.split()
	    if not words:
	        return  # empty / whitespace-only fragment: nothing to chunk
	    if len(words) <= limit:
	        # Short enough: keep the sentence exactly as written.
	        yield sentence, len(words)
	        return
	    for start in range(0, len(words), limit):
	        window = words[start:start + limit]
	        yield " ".join(window), len(window)