Spaces:

Merlintxu
/

Chunkings

Sleeping

Chunkings / src /reader.py

Marcos Morales

modified: README.md

dd58f3d 24 days ago

1.21 kB

	"""Lectura de Markdown, DOCX y PDF con front‑matter opcional."""
	import re
	from pathlib import Path
	from typing import Dict, Tuple, Union

	import PyPDF2
	import yaml
	from docx import Document as DocxDocument

	# Matches a document that opens with a YAML front-matter block:
	#   group 1 - the raw text between the two "---" fence lines (lazy, so it
	#             stops at the first closing fence);
	#   group 2 - everything after the closing fence (may be empty).
	# re.DOTALL lets "." cross newlines so both groups can span several lines.
	_FM = re.compile(r"^---\n(.*?)\n---\n(.*)$", re.DOTALL)

	def _split_fm(text: str) -> Tuple[Dict, str]:
	    """Split optional YAML front matter from the rest of *text*.

	    Args:
	        text: Full document text.

	    Returns:
	        A ``(meta, body)`` pair. When *text* opens with a fenced block that
	        parses to a YAML mapping (or to an empty/falsy value), ``meta`` is
	        that mapping (or ``{}``) and ``body`` is the text after the closing
	        fence. In every other case ``meta`` is ``{}`` and ``body`` is
	        *text* unchanged.

	    Raises:
	        yaml.YAMLError: Propagated from ``yaml.safe_load`` when the fenced
	            block is not valid YAML.
	    """
	    match = _FM.match(text)
	    if match is None:
	        return {}, text

	    meta_raw, body = match.groups()
	    meta = yaml.safe_load(meta_raw) or {}
	    if not isinstance(meta, dict):
	        # The fenced text parsed to a scalar or a list, e.g. a Markdown file
	        # that merely starts with a "---" horizontal rule. That is not
	        # metadata, so keep the whole text as the body instead of dropping
	        # the fenced part and handing callers a non-dict.
	        return {}, text
	    return meta, body

	def _read_md(path: Path) -> Tuple[Dict, str]:
	    """Read a Markdown file and separate its optional front matter.

	    Args:
	        path: Location of the Markdown file.

	    Returns:
	        The ``(meta, body)`` pair produced by ``_split_fm`` for the file's
	        decoded text.
	    """
	    # "utf-8-sig" decodes plain UTF-8 as well, but also drops a leading
	    # byte-order mark. A BOM left in the text would sit before the opening
	    # "---" and keep the front matter from being matched.
	    raw = path.read_text(encoding="utf-8-sig")
	    return _split_fm(raw)

	def _read_docx(path: Path) -> Tuple[Dict, str]:
	    """Extract the paragraph text of a DOCX file.

	    Args:
	        path: Location of the DOCX file.

	    Returns:
	        ``({}, body)`` - the metadata dict is always empty; ``body`` is the
	        text of every entry in the document's ``paragraphs``, joined with
	        newlines.
	    """
	    document = DocxDocument(path)
	    paragraph_texts = [paragraph.text for paragraph in document.paragraphs]
	    return {}, "\n".join(paragraph_texts)

	def _read_pdf(path: Path) -> Tuple[Dict, str]:
	    """Extract the text of a PDF file, page by page.

	    Args:
	        path: Location of the PDF file.

	    Returns:
	        ``({}, body)`` - the metadata dict is always empty; ``body`` is the
	        extracted text of each page joined with newlines. A page whose
	        extraction yields a falsy value contributes an empty string.
	    """
	    reader = PyPDF2.PdfReader(str(path))
	    page_texts = []
	    for page in reader.pages:
	        extracted = page.extract_text()
	        page_texts.append(extracted or "")
	    return {}, "\n".join(page_texts)

	def read_file(path: Union[str, Path]) -> Tuple[Dict, str]:
	    """Read a supported document and return its metadata and text.

	    The reader is chosen from the file extension, compared
	    case-insensitively: ``.md`` / ``.markdown``, ``.docx`` or ``.pdf``.

	    Args:
	        path: Location of the file, as a ``Path`` or a plain string.

	    Returns:
	        The ``(meta, body)`` pair produced by the reader for that format.

	    Raises:
	        ValueError: If the extension is not one of the supported formats.
	    """
	    # Accept plain strings as well; the helpers below rely on Path methods.
	    path = Path(path)
	    ext = path.suffix.lower()
	    if ext in {".md", ".markdown"}:
	        return _read_md(path)
	    if ext == ".docx":
	        return _read_docx(path)
	    if ext == ".pdf":
	        return _read_pdf(path)
	    raise ValueError(f"Formato no soportado: {ext}")