# Chunkings/src/reader.py
# Marcos Morales
# modified: README.md
# dd58f3d
"""Read Markdown, DOCX and PDF files; Markdown may carry optional YAML front matter."""
import re
from pathlib import Path
from typing import Dict, Tuple, Union

import PyPDF2
import yaml
from docx import Document as DocxDocument
_FM = re.compile(r"^---\n(.*?)\n---\n(.*)$", re.DOTALL)
def _split_fm(text: str) -> Tuple[Dict, str]:
m = _FM.match(text)
if m:
meta_raw, body = m.groups()
meta = yaml.safe_load(meta_raw) or {}
return meta, body
return {}, text
def _read_md(path: Path) -> Tuple[Dict, str]:
    """Read a Markdown file and split off its optional front matter.

    The file is decoded with ``utf-8-sig``: this reads plain UTF-8 exactly
    as before, but drops a leading byte-order mark when one is present.
    Left in place, the mark would be the first character of the text and
    would keep the front-matter delimiter from being found at the start.

    Args:
        path: Location of the Markdown file.

    Returns:
        The ``(metadata, body)`` tuple produced by ``_split_fm``.
    """
    raw = path.read_text(encoding="utf-8-sig")
    return _split_fm(raw)
def _read_docx(path: Path) -> Tuple[Dict, str]:
    """Read the paragraph text of a DOCX file.

    Only the text of ``doc.paragraphs`` is collected, one paragraph per
    line; content that is not exposed through that attribute is not
    included.

    Args:
        path: Location of the ``.docx`` file.

    Returns:
        ``({}, body)`` -- the metadata dict is always empty for DOCX input.
    """
    # Pass a plain string, as _read_pdf does: a string path is what the
    # library documents as accepted (besides a file-like object).
    doc = DocxDocument(str(path))
    body = "\n".join(p.text for p in doc.paragraphs)
    return {}, body
def _read_pdf(path: Path) -> Tuple[Dict, str]:
    """Extract the text of every page of a PDF file.

    Page texts are joined with newlines in page order; a page for which
    ``extract_text()`` yields nothing contributes an empty string.

    Args:
        path: Location of the ``.pdf`` file.

    Returns:
        ``({}, body)`` -- the metadata dict is always empty for PDF input.
    """
    reader = PyPDF2.PdfReader(str(path))
    page_texts = []
    for page in reader.pages:
        extracted = page.extract_text()
        # Normalise a falsy result (None or "") to an empty string.
        page_texts.append(extracted if extracted else "")
    return {}, "\n".join(page_texts)
def read_file(path: Union[str, Path]) -> Tuple[Dict, str]:
    """Read a document and return ``(metadata, body)``.

    The reader is chosen from the file extension, compared
    case-insensitively: ``.md`` / ``.markdown``, ``.docx`` or ``.pdf``.

    Args:
        path: Location of the file, as a ``Path`` or a plain string.

    Returns:
        The ``(metadata, body)`` tuple produced by the matching reader.

    Raises:
        ValueError: If the extension is not one of the supported formats.
    """
    # Coerce so that callers may pass a string path; a Path is unaffected.
    path = Path(path)
    ext = path.suffix.lower()
    if ext in {".md", ".markdown"}:
        return _read_md(path)
    if ext == ".docx":
        return _read_docx(path)
    if ext == ".pdf":
        return _read_pdf(path)
    raise ValueError(f"Formato no soportado: {ext}")