Marcos Morales committed
Commit · dd58f3d
Parent(s): 11af251

modified:  README.md
modified:  app.py
modified:  requirements.txt
new file:  scripts/run_preprocess.sh
new file:  src/__init__.py
new file:  src/chunker.py
new file:  src/config.py
new file:  src/embeddings.py
new file:  src/preprocess.py
new file:  src/reader.py
- README.md +10 -13
- app.py +27 -85
- requirements.txt +15 -6
- scripts/run_preprocess.sh +6 -0
- src/__init__.py +1 -0
- src/chunker.py +17 -0
- src/config.py +14 -0
- src/embeddings.py +19 -0
- src/preprocess.py +41 -0
- src/reader.py +41 -0

README.md CHANGED

@@ -1,14 +1,11 @@
----
-title: Chunkings
-emoji: 🏃
-colorFrom: indigo
-colorTo: blue
-sdk: gradio
-sdk_version: 5.38.0
-app_file: app.py
-pinned: false
-license: apache-2.0
-short_description: Docs to chunks
----
+# HF Vector Pipeline

-
+Pipeline end‑to‑end para convertir documentos (`.md`, `.docx`, `.pdf`)
+en un **JSONL con embeddings** y metadatos, listo para cargar en
+**Amazon S3 Vector Features**.
+
+Incluye:
+
+* CLI (`python -m src.preprocess …`)
+* UI Gradio (archivo **app.py**) preparada para HuggingFace Spaces
+* Soporte para Windows 11 + VSCode
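
For reference, each line of the generated JSONL holds one chunk as an object with `id`, `vector`, and `metadata` keys (those names come from `run_pipeline` in app.py below). A minimal sketch of one record; the vector values are placeholders and `guia-seo.md` is a hypothetical source document:

```python
import json

# Illustrative only: real vectors come from sentence-transformers and are much longer.
record = {
    "id": "guia-seo-chunk-0001",
    "vector": [0.012, -0.034, 0.057],      # truncated for readability
    "metadata": {
        "vertical": "SEO-LLM",
        "language": "es",
        "source": "guia-seo.md",
        "chunk_index": 1,
    },
}
print(json.dumps(record, ensure_ascii=False))  # one line of the output JSONL
```
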

app.py CHANGED

@@ -1,97 +1,39 @@
 import gradio as gr
-import yaml
-import json
-import uuid
+import yaml, json, uuid, os
 from pathlib import Path
-from sentence_transformers import SentenceTransformer
-import PyPDF2
-from docx import Document
-import tiktoken
-
-model = SentenceTransformer('all-MiniLM-L6-v2')
-tokenizer = tiktoken.get_encoding("cl100k_base")
-
-def extract_front_matter_and_body(text: str):
-    import re
-    fm_regex = r"^---\n(.*?)\n---\n(.*)$"
-    m = re.match(fm_regex, text, re.DOTALL)
-    if m:
-        meta = yaml.safe_load(m.group(1)) or {}
-        body = m.group(2)
-    else:
-        meta = {}
-        body = text
-    return meta, body
-
-def chunk_text(text: str, max_tokens: int = 500, overlap: int = 50):
-    tokens = tokenizer.encode(text)
-    chunks = []
-    start = 0
-    while start < len(tokens):
-        end = min(start + max_tokens, len(tokens))
-        chunk_toks = tokens[start:end]
-        chunks.append(tokenizer.decode(chunk_toks))
-        start += max_tokens - overlap
-    return chunks
-
-def process_file(path: str, vertical: str, language: str):
-    ext = Path(path).suffix.lower()
-    if ext in ['.md', '.markdown']:
-        raw = Path(path).read_text(encoding='utf-8')
-        meta, body = extract_front_matter_and_body(raw)
-    elif ext == '.docx':
-        doc = Document(path)
-        body = "\n".join(p.text for p in doc.paragraphs)
-        meta = {}
-    elif ext == '.pdf':
-        reader = PyPDF2.PdfReader(path)
-        pages = [page.extract_text() or "" for page in reader.pages]
-        body = "\n".join(pages)
-        meta = {}
-    else:
-        return []
-
-    default_meta = {
-        'vertical': vertical,
-        'language': language,
-        'source': Path(path).name
-    }
-    meta = {**default_meta, **meta}
-    records = []
-    for i, chunk in enumerate(chunk_text(body)):
-        emb = model.encode(chunk).tolist()
-        metadata = {
-            'id': f"{Path(path).stem}-chunk-{i+1:04d}",
-            'chunk_index': i+1,
-            **meta
-        }
-        records.append({'vector': emb, 'metadata': metadata})
-    return records
+from src.reader import read_file
+from src.chunker import chunk_text
+from src.embeddings import embed_texts

 def run_pipeline(files, vertical, language):
-
+    recs = []
     for file_path in files:
-
-
-
-
-
-
-
+        meta, body = read_file(Path(file_path))
+        base_meta = {"vertical": vertical, "language": language, "source": Path(file_path).name, **meta}
+        chunks = chunk_text(body)
+        vecs = embed_texts(chunks)
+        for i, (chunk, vec) in enumerate(zip(chunks, vecs), 1):
+            recs.append({
+                "id": f"{Path(file_path).stem}-chunk-{i:04d}",
+                "vector": vec,
+                "metadata": {**base_meta, "chunk_index": i}
+            })
+    out_path = f"/tmp/{uuid.uuid4().hex}.jsonl"
+    with open(out_path, "w", encoding="utf-8") as f:
+        for r in recs:
+            json.dump(r, f, ensure_ascii=False)
             f.write("\n")
-    return
+    return out_path

-
-
-    gr.Markdown("## Ingesta para Amazon S3 Vector Features")
+with gr.Blocks() as demo:
+    gr.Markdown("## Ingesta para Amazon S3 Vector Features")
     with gr.Row():
-        uploader = gr.File(label="Sube
-        vertical = gr.Textbox(label="Vertical
+        uploader = gr.File(label="Sube documentos", file_count="multiple", type="filepath")
+        vertical = gr.Textbox(label="Vertical", value="general")
         language = gr.Textbox(label="Idioma", value="es")
-    btn = gr.Button("Procesar y
-
-
-    btn.click(fn=run_pipeline, inputs=[uploader, vertical, language], outputs=output)
+    btn = gr.Button("Procesar y generar JSONL")
+    outfile = gr.File(label="Descarga JSONL")
+    btn.click(run_pipeline, inputs=[uploader, vertical, language], outputs=outfile)

 if __name__ == "__main__":
     demo.launch()
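
The new `run_pipeline` can also be exercised outside Gradio by calling it directly; it writes a JSONL under `/tmp` and returns the path, which the `gr.File` output then offers for download. A sketch, assuming the listed requirements are installed, the embedding model can be downloaded, and `sample_docs/intro.md` is a hypothetical input file:

```python
from app import run_pipeline

# Returns the path of the JSONL written for the uploaded files.
out_path = run_pipeline(["sample_docs/intro.md"], vertical="general", language="es")

with open(out_path, encoding="utf-8") as f:
    first = f.readline()

print(out_path)
print(first[:120])  # first record, truncated
```
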

requirements.txt CHANGED

@@ -1,6 +1,15 @@
-
-pyyaml
-python-docx
-PyPDF2
-sentence-transformers
-tiktoken
+# Core processing
+pyyaml>=6.0
+python-docx>=1.0
+PyPDF2>=3.0
+sentence-transformers>=2.7
+tiktoken>=0.7
+
+# CLI
+click>=8.1
+
+# UI (HuggingFace Space)
+gradio>=4.32
+
+# Opcional: variables de entorno
+python-dotenv>=1.0

scripts/run_preprocess.sh ADDED

@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+python -m src.preprocess \
+  --input-dir sample_docs \
+  --output dist/output.jsonl \
+  --vertical SEO-LLM \
+  --language es

src/__init__.py ADDED

@@ -0,0 +1 @@
+"""Package marker."""

src/chunker.py ADDED

@@ -0,0 +1,17 @@
+"""Chunking token‑based."""
+from typing import List
+import tiktoken
+from .config import CHUNK_SIZE, CHUNK_OVERLAP
+
+_tok = tiktoken.get_encoding("cl100k_base")
+
+def chunk_text(text: str,
+               max_tokens: int = CHUNK_SIZE,
+               overlap: int = CHUNK_OVERLAP) -> List[str]:
+    tokens = _tok.encode(text)
+    out, start, step = [], 0, max_tokens - overlap
+    while start < len(tokens):
+        end = min(start + max_tokens, len(tokens))
+        out.append(_tok.decode(tokens[start:end]))
+        start += step
+    return out
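
The window in `chunk_text` advances by `max_tokens - overlap` each step, so consecutive chunks share `overlap` tokens. A standalone sketch of the same arithmetic, using plain integers instead of real tokens so no tiktoken encoding needs to be downloaded:

```python
# Mirror of chunk_text's loop over a fake 1200-"token" document.
tokens = list(range(1200))
max_tokens, overlap = 500, 50      # the CHUNK_SIZE / CHUNK_OVERLAP defaults
step = max_tokens - overlap        # 450

start, spans = 0, []
while start < len(tokens):
    end = min(start + max_tokens, len(tokens))
    spans.append((start, end))
    start += step

print(spans)  # [(0, 500), (450, 950), (900, 1200)] -> 3 overlapping chunks
```
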

src/config.py ADDED

@@ -0,0 +1,14 @@
+"""Carga de configuración y constantes globales."""
+from pathlib import Path
+from dotenv import load_dotenv
+import os
+
+ENV_PATH = Path(__file__).resolve().parent.parent / ".env"
+if ENV_PATH.exists():
+    load_dotenv(ENV_PATH)
+
+EMBEDDING_MODEL: str = os.getenv("EMBEDDING_MODEL", "intfloat/e5-large-v2")
+DEVICE: str = os.getenv("DEVICE", "cpu")
+
+CHUNK_SIZE: int = 500      # tokens por chunk
+CHUNK_OVERLAP: int = 50    # solape entre chunks
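
Because `src/config.py` reads `EMBEDDING_MODEL` and `DEVICE` from the environment (optionally via a `.env` file) at import time, the model can be swapped without touching code. A sketch, assuming python-dotenv is installed and the script runs from the repo root; `all-MiniLM-L6-v2` is just an example override (it is the model the old app.py used):

```python
import os

# Hypothetical override: must be set before src.config is imported,
# since the module evaluates os.getenv() when it loads.
os.environ["EMBEDDING_MODEL"] = "all-MiniLM-L6-v2"
os.environ["DEVICE"] = "cpu"

from src import config

print(config.EMBEDDING_MODEL)                    # all-MiniLM-L6-v2
print(config.DEVICE)                             # cpu
print(config.CHUNK_SIZE, config.CHUNK_OVERLAP)   # 500 50
```
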

src/embeddings.py ADDED

@@ -0,0 +1,19 @@
+"""SentenceTransformer wrapper."""
+from typing import List
+from sentence_transformers import SentenceTransformer
+from .config import EMBEDDING_MODEL, DEVICE
+
+_model: SentenceTransformer | None = None
+
+def _model_instance() -> SentenceTransformer:
+    global _model
+    if _model is None:
+        _model = SentenceTransformer(EMBEDDING_MODEL, device=DEVICE)
+    return _model
+
+def embed_texts(texts: List[str]) -> List[List[float]]:
+    return _model_instance().encode(
+        texts,
+        show_progress_bar=False,
+        convert_to_numpy=False
+    ).tolist()
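
The module keeps a single lazily created model instance, so the slow model load happens once per process rather than once per call. A standalone sketch of that lazy-singleton pattern with a stand-in class (so nothing is downloaded here); the real module uses `SentenceTransformer` instead of `FakeModel`:

```python
from typing import List, Optional

class FakeModel:
    """Stand-in for SentenceTransformer: pretends construction is expensive."""
    def __init__(self, name: str):
        print(f"loading {name} ...")             # happens only once
        self.name = name
    def encode(self, texts: List[str]) -> List[List[float]]:
        return [[float(len(t))] for t in texts]  # dummy 1-d "embeddings"

_model: Optional[FakeModel] = None

def _model_instance() -> FakeModel:
    global _model
    if _model is None:                 # first call creates the instance
        _model = FakeModel("example-model")
    return _model                      # later calls reuse it

def embed_texts(texts: List[str]) -> List[List[float]]:
    return _model_instance().encode(texts)

print(embed_texts(["hola"]))    # triggers the load
print(embed_texts(["mundo"]))   # reuses the cached instance
```
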

src/preprocess.py ADDED

@@ -0,0 +1,41 @@
+"""CLI: lee → chunkea → embed → JSONL."""
+from pathlib import Path
+from typing import Dict, List
+import json
+import click
+from .reader import read_file
+from .chunker import chunk_text
+from .embeddings import embed_texts
+
+@click.command()
+@click.option("--input-dir", type=click.Path(exists=True, file_okay=False),
+              required=True, help="Carpeta con documentos.")
+@click.option("--output", type=click.Path(), required=True,
+              help="Ruta del JSONL de salida.")
+@click.option("--vertical", default="general", help="Vertical.")
+@click.option("--language", default="es", help="Idioma.")
+def main(input_dir: str, output: str, vertical: str, language: str):
+    recs: List[Dict] = []
+    for p in Path(input_dir).iterdir():
+        if not p.is_file():
+            continue
+        meta, body = read_file(p)
+        base_meta = {"vertical": vertical, "language": language, "source": p.name, **meta}
+        chunks = chunk_text(body)
+        vecs = embed_texts(chunks)
+        for i, (chunk, vec) in enumerate(zip(chunks, vecs), 1):
+            recs.append({
+                "id": f"{p.stem}-chunk-{i:04d}",
+                "vector": vec,
+                "metadata": {**base_meta, "chunk_index": i}
+            })
+    out = Path(output)
+    out.parent.mkdir(parents=True, exist_ok=True)
+    with out.open("w", encoding="utf-8") as f:
+        for r in recs:
+            json.dump(r, f, ensure_ascii=False)
+            f.write("\n")
+    click.echo(f"Wrote {len(recs)} records → {out}")
+
+if __name__ == "__main__":
+    main()
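
After the CLI runs, each line of the output file should parse as one self-contained record. A small sanity-check sketch, assuming the command in scripts/run_preprocess.sh has already produced `dist/output.jsonl`:

```python
import json
from pathlib import Path

out = Path("dist/output.jsonl")  # path used by scripts/run_preprocess.sh
records = [json.loads(line) for line in out.read_text(encoding="utf-8").splitlines()]

print(len(records), "records")
first = records[0]
print(sorted(first))                                              # ['id', 'metadata', 'vector']
print(first["metadata"]["source"], first["metadata"]["chunk_index"])
```
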

src/reader.py ADDED

@@ -0,0 +1,41 @@
+"""Lectura de Markdown, DOCX y PDF con front‑matter opcional."""
+from pathlib import Path
+from typing import Tuple, Dict
+import re
+import yaml
+from docx import Document as DocxDocument
+import PyPDF2
+
+_FM = re.compile(r"^---\n(.*?)\n---\n(.*)$", re.DOTALL)
+
+def _split_fm(text: str) -> Tuple[Dict, str]:
+    m = _FM.match(text)
+    if m:
+        meta_raw, body = m.groups()
+        meta = yaml.safe_load(meta_raw) or {}
+        return meta, body
+    return {}, text
+
+def _read_md(path: Path) -> Tuple[Dict, str]:
+    raw = path.read_text(encoding="utf-8")
+    return _split_fm(raw)
+
+def _read_docx(path: Path) -> Tuple[Dict, str]:
+    doc = DocxDocument(path)
+    body = "\n".join(p.text for p in doc.paragraphs)
+    return {}, body
+
+def _read_pdf(path: Path) -> Tuple[Dict, str]:
+    r = PyPDF2.PdfReader(str(path))
+    body = "\n".join(page.extract_text() or "" for page in r.pages)
+    return {}, body
+
+def read_file(path: Path) -> Tuple[Dict, str]:
+    ext = path.suffix.lower()
+    if ext in {".md", ".markdown"}:
+        return _read_md(path)
+    if ext == ".docx":
+        return _read_docx(path)
+    if ext == ".pdf":
+        return _read_pdf(path)
+    raise ValueError(f"Formato no soportado: {ext}")
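
Usage sketch for `read_file` on a Markdown document with YAML front matter, written to a temporary directory so the example is self-contained; it needs pyyaml, python-docx, and PyPDF2 installed since the module imports all three, and the front-matter values are just examples:

```python
import tempfile
from pathlib import Path
from src.reader import read_file

sample = """---
title: Ejemplo
vertical: SEO-LLM
---
Cuerpo del documento en Markdown.
"""

with tempfile.TemporaryDirectory() as d:
    p = Path(d) / "ejemplo.md"
    p.write_text(sample, encoding="utf-8")
    meta, body = read_file(p)       # front matter is split off and parsed as YAML

print(meta)   # {'title': 'Ejemplo', 'vertical': 'SEO-LLM'}
print(body)   # 'Cuerpo del documento en Markdown.\n'
```
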