"""Read Markdown, DOCX and PDF files, with optional YAML front matter.

Every reader returns a ``(metadata, body)`` tuple. Only the Markdown reader
looks for front matter; the DOCX and PDF readers always return an empty
metadata dict.
"""

import re
from pathlib import Path
from typing import Dict, Tuple, Union

import PyPDF2
import yaml
from docx import Document as DocxDocument

# Front-matter block anchored at the very start of the text:
#   group 1 -> raw YAML between the two ``---`` delimiter lines
#              (None when the block is empty, i.e. ``---`` directly
#              followed by ``---``)
#   group 2 -> everything after the closing delimiter line
#              (None when the text ends right at the delimiter)
# Delimiter lines may carry trailing blanks and either LF or CRLF endings.
# The lazy ``??`` makes the engine try the "empty block" reading first, so
# ``---\n---\n`` is not mistaken for the start of a longer block.
_FM = re.compile(
    r"^---[ \t]*\r?\n(?:(.*?)\r?\n)??---[ \t]*(?:\r?\n(.*))?\Z",
    re.DOTALL,
)


def _split_fm(text: str) -> Tuple[Dict, str]:
    """Split *text* into its front-matter metadata and the remaining body.

    Args:
        text: Full document text.

    Returns:
        ``(meta, body)``. When *text* has no front-matter block, or the block
        does not parse to a YAML mapping, ``meta`` is ``{}`` and ``body`` is
        *text* unchanged.

    Raises:
        yaml.YAMLError: If the delimited block is not valid YAML.
    """
    match = _FM.match(text)
    if match is None:
        return {}, text

    meta_raw, body = match.groups()
    meta = yaml.safe_load(meta_raw or "") or {}
    if not isinstance(meta, dict):
        # A document that merely opens with a ``---`` rule followed by plain
        # text parses to a str/list; that is content, not metadata, so keep
        # the whole text as the body.
        return {}, text
    return meta, body or ""


def _read_md(path: Path) -> Tuple[Dict, str]:
    """Read a Markdown file and separate its optional front matter."""
    # ``utf-8-sig`` drops a leading BOM if one is present; otherwise the BOM
    # would sit in front of ``---`` and the front matter would go undetected.
    raw = path.read_text(encoding="utf-8-sig")
    return _split_fm(raw)


def _read_docx(path: Path) -> Tuple[Dict, str]:
    """Read a DOCX file; the body is the text of ``doc.paragraphs`` joined by newlines."""
    doc = DocxDocument(path)
    body = "\n".join(p.text for p in doc.paragraphs)
    return {}, body


def _read_pdf(path: Path) -> Tuple[Dict, str]:
    """Read a PDF file; the body is the extracted text of each page joined by newlines."""
    reader = PyPDF2.PdfReader(str(path))
    # A falsy ``extract_text()`` result (e.g. ``None``) contributes an empty
    # string so the join never fails.
    body = "\n".join(page.extract_text() or "" for page in reader.pages)
    return {}, body


def read_file(path: Union[str, Path]) -> Tuple[Dict, str]:
    """Read *path* with the reader that matches its (case-insensitive) suffix.

    Args:
        path: File to read, as a ``Path`` or a string path.

    Returns:
        ``(meta, body)`` as produced by the selected reader.

    Raises:
        ValueError: If the suffix is not ``.md``, ``.markdown``, ``.docx``
            or ``.pdf``.
    """
    path = Path(path)  # accept plain strings as well as Path objects
    ext = path.suffix.lower()
    if ext in {".md", ".markdown"}:
        return _read_md(path)
    if ext == ".docx":
        return _read_docx(path)
    if ext == ".pdf":
        return _read_pdf(path)
    raise ValueError(f"Formato no soportado: {ext}")