|
"""Lectura de Markdown, DOCX y PDF con front‑matter opcional.""" |
|
from pathlib import Path |
|
from typing import Tuple, Dict |
|
import re |
|
import yaml |
|
from docx import Document as DocxDocument |
|
import PyPDF2 |
|
|
|
_FM = re.compile(r"^---\n(.*?)\n---\n(.*)$", re.DOTALL) |
|
|
|
def _split_fm(text: str) -> Tuple[Dict, str]:
    """Split *text* into front-matter metadata and the remaining body.

    Args:
        text: Full document text, possibly starting with a ``---`` fenced
            YAML block.

    Returns:
        A ``(meta, body)`` tuple. When ``_FM`` does not match, or when the
        fenced block loads to something other than a mapping, ``meta`` is
        ``{}`` and ``body`` is *text* unchanged.

    Raises:
        Any exception raised by ``yaml.safe_load`` for the fenced block
        propagates to the caller.
    """
    m = _FM.match(text)
    if m is None:
        return {}, text

    meta_raw, body = m.groups()
    # An empty block loads as None; normalise every falsy result to {}.
    meta = yaml.safe_load(meta_raw) or {}
    if not isinstance(meta, dict):
        # A document that merely opens with a "---" rule followed by plain
        # text looks like front matter but loads as a scalar or a list.
        # Returning that would break the declared Dict contract, so treat the
        # text as having no front matter at all.
        return {}, text
    return meta, body
|
|
|
def _read_md(path: Path) -> Tuple[Dict, str]:
    """Read a Markdown file and split off its optional front matter.

    Args:
        path: Location of the Markdown file.

    Returns:
        The ``(meta, body)`` tuple produced by ``_split_fm`` for the decoded
        file contents.
    """
    # "utf-8-sig" decodes plain UTF-8 too, but also drops a leading BOM. With
    # plain "utf-8" the BOM stays in the text as "\ufeff", so the text no
    # longer starts with "---" and the front matter goes unrecognised.
    raw = path.read_text(encoding="utf-8-sig")
    return _split_fm(raw)
|
|
|
def _read_docx(path: Path) -> Tuple[Dict, str]:
    """Extract the paragraph text of a DOCX file.

    Args:
        path: Location of the ``.docx`` file.

    Returns:
        ``({}, body)``: the metadata is always empty, and ``body`` is the
        ``text`` of each entry in the document's ``paragraphs``, joined with
        newlines.
    """
    document = DocxDocument(path)
    lines = [paragraph.text for paragraph in document.paragraphs]
    return {}, "\n".join(lines)
|
|
|
def _read_pdf(path: Path) -> Tuple[Dict, str]:
    """Extract the text of every page of a PDF file.

    Args:
        path: Location of the ``.pdf`` file; handed to the reader as a string.

    Returns:
        ``({}, body)``: the metadata is always empty, and ``body`` is the
        extracted text of each page, joined with newlines.
    """
    reader = PyPDF2.PdfReader(str(path))
    chunks = []
    for page in reader.pages:
        # A falsy extraction result is replaced by "" so the page still
        # contributes an (empty) line and join() only ever sees strings.
        chunks.append(page.extract_text() or "")
    return {}, "\n".join(chunks)
|
|
|
def read_file(path: Path) -> Tuple[Dict, str]:
    """Read a supported document and return its ``(meta, body)`` pair.

    The reader is chosen from the file suffix, compared case-insensitively:
    ``.md`` / ``.markdown`` use ``_read_md``, ``.docx`` uses ``_read_docx``
    and ``.pdf`` uses ``_read_pdf``.

    Args:
        path: Location of the file. A plain ``str`` is also accepted and is
            converted to a ``Path`` first.

    Returns:
        Whatever the selected reader returns.

    Raises:
        ValueError: If the suffix is not one of the supported ones.
    """
    if isinstance(path, str):
        # Callers often hold a plain string path; without this conversion the
        # ``.suffix`` lookup below fails with AttributeError. Only ``str`` is
        # coerced so other path-like objects are passed through untouched.
        path = Path(path)

    ext = path.suffix.lower()
    if ext in {".md", ".markdown"}:
        return _read_md(path)
    if ext == ".docx":
        return _read_docx(path)
    if ext == ".pdf":
        return _read_pdf(path)
    raise ValueError(f"Formato no soportado: {ext}")
|
|