File size: 1,211 Bytes
dd58f3d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
"""Lectura de Markdown, DOCX y PDF con front‑matter opcional."""
from pathlib import Path
from typing import Tuple, Dict
import re
import yaml
from docx import Document as DocxDocument
import PyPDF2

# Front matter is a leading block fenced by two `---` lines.
#   group 1: the metadata text. It is optional and lazily skipped first, so
#            `---\n---` is read as an empty block instead of swallowing body
#            text up to some later `---`.
#   group 2: everything after the closing fence. It is optional so the closing
#            fence may be the very last thing in the text.
_FM = re.compile(r"^---\n(?:(.*?)\n)??---(?:\n(.*))?$", re.DOTALL)

def _split_fm(text: str) -> Tuple[Dict, str]:
    """Split *text* into its front-matter mapping and the remaining body.

    Args:
        text: Full document text, possibly starting with a ``---`` fenced
            YAML block.

    Returns:
        ``(meta, body)``. ``meta`` is the mapping parsed from the fenced
        block, or ``{}`` when that block is empty. When *text* has no front
        matter, or the fenced block parses to something that is not a
        mapping, the result is ``({}, text)`` with the text left untouched.

    Raises:
        yaml.YAMLError: If the fenced block is not valid YAML.
    """
    m = _FM.match(text)
    if m is None:
        return {}, text

    # Either group is None when its optional part of the pattern was skipped.
    meta_raw = m.group(1) or ""
    body = m.group(2) or ""

    if not meta_raw.strip():
        # Empty block: there is nothing to hand to the YAML parser.
        return {}, body

    meta = yaml.safe_load(meta_raw) or {}
    if not isinstance(meta, dict):
        # A scalar or list is not metadata (e.g. plain text sitting between
        # two horizontal rules), so treat the whole input as body rather than
        # returning a non-dict where callers expect a mapping.
        return {}, text
    return meta, body

def _read_md(path: Path) -> Tuple[Dict, str]:
    """Read a Markdown file and separate its optional front matter.

    Args:
        path: Location of the Markdown file; its content is decoded as UTF-8.

    Returns:
        The ``(meta, body)`` pair produced by ``_split_fm`` for the file text.
    """
    # "utf-8-sig" drops a leading byte-order mark when one is present and
    # otherwise decodes exactly like "utf-8". With plain "utf-8" the BOM stays
    # in the text as U+FEFF ahead of the opening `---`, so front matter in
    # BOM-prefixed files would never be detected.
    raw = path.read_text(encoding="utf-8-sig")
    return _split_fm(raw)

def _read_docx(path: Path) -> Tuple[Dict, str]:
    """Extract the paragraph text of a DOCX file.

    Args:
        path: Location of the DOCX file.

    Returns:
        An empty metadata dict and the text of every entry in the document's
        ``paragraphs``, joined with newlines.
    """
    document = DocxDocument(path)
    paragraph_texts = [paragraph.text for paragraph in document.paragraphs]
    return {}, "\n".join(paragraph_texts)

def _read_pdf(path: Path) -> Tuple[Dict, str]:
    """Extract the text of every page of a PDF file.

    Args:
        path: Location of the PDF file.

    Returns:
        An empty metadata dict and the per-page text joined with newlines.
        A page whose ``extract_text()`` result is falsy contributes an empty
        string.
    """
    reader = PyPDF2.PdfReader(str(path))
    page_texts = []
    for page in reader.pages:
        extracted = page.extract_text()
        page_texts.append(extracted or "")
    return {}, "\n".join(page_texts)

def read_file(path: Path) -> Tuple[Dict, str]:
    """Read a document, picking the reader from the file extension.

    The extension is compared case-insensitively. ``.md`` and ``.markdown``
    go to the Markdown reader, ``.docx`` to the DOCX reader and ``.pdf`` to
    the PDF reader.

    Args:
        path: Location of the file to read.

    Returns:
        The ``(meta, body)`` pair returned by the selected reader.

    Raises:
        ValueError: If the extension is not one of the supported ones.
    """
    ext = path.suffix.lower()
    if ext == ".pdf":
        return _read_pdf(path)
    if ext == ".docx":
        return _read_docx(path)
    if ext not in (".md", ".markdown"):
        raise ValueError(f"Formato no soportado: {ext}")
    return _read_md(path)