Marcos Morales committed
Commit · dd58f3d
Parent(s): 11af251

modified:  README.md
modified:  app.py
modified:  requirements.txt
new file:  scripts/run_preprocess.sh
new file:  src/__init__.py
new file:  src/chunker.py
new file:  src/config.py
new file:  src/embeddings.py
new file:  src/preprocess.py
new file:  src/reader.py
- README.md +10 -13
- app.py +27 -85
- requirements.txt +15 -6
- scripts/run_preprocess.sh +6 -0
- src/__init__.py +1 -0
- src/chunker.py +17 -0
- src/config.py +14 -0
- src/embeddings.py +19 -0
- src/preprocess.py +41 -0
- src/reader.py +41 -0

README.md CHANGED

@@ -1,14 +1,11 @@
----
-title: Chunkings
-emoji: 🏃
-colorFrom: indigo
-colorTo: blue
-sdk: gradio
-sdk_version: 5.38.0
-app_file: app.py
-pinned: false
-license: apache-2.0
-short_description: Docs to chunks
----
+# HF Vector Pipeline

-
+Pipeline end‑to‑end para convertir documentos (`.md`, `.docx`, `.pdf`)
+en un **JSONL con embeddings** y metadatos, listo para cargar en
+**Amazon S3 Vector Features**.
+
+Incluye:
+
+* CLI (`python -m src.preprocess …`)
+* UI Gradio (archivo **app.py**) preparada para HuggingFace Spaces
+* Soporte para Windows 11 + VSCode
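
For reference, each line of the generated JSONL holds one chunk as an object with `id`, `vector`, and `metadata` keys (those names come from `run_pipeline` in app.py below). A minimal sketch of one record; the vector values are placeholders and `guia-seo.md` is a hypothetical source document:

```python
import json

# Illustrative only: real vectors come from sentence-transformers and are much longer.
record = {
    "id": "guia-seo-chunk-0001",
    "vector": [0.012, -0.034, 0.057],      # truncated for readability
    "metadata": {
        "vertical": "SEO-LLM",
        "language": "es",
        "source": "guia-seo.md",
        "chunk_index": 1,
    },
}
print(json.dumps(record, ensure_ascii=False))  # one line of the output JSONL
```
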

app.py CHANGED

@@ -1,97 +1,39 @@
 import gradio as gr
-import yaml
-import json
-import uuid
+import yaml, json, uuid, os
 from pathlib import Path
-from sentence_transformers import SentenceTransformer
-import PyPDF2
-from docx import Document
-import tiktoken
-
-model = SentenceTransformer('all-MiniLM-L6-v2')
-tokenizer = tiktoken.get_encoding("cl100k_base")
-
-def extract_front_matter_and_body(text: str):
-    import re
-    fm_regex = r"^---\n(.*?)\n---\n(.*)$"
-    m = re.match(fm_regex, text, re.DOTALL)
-    if m:
-        meta = yaml.safe_load(m.group(1)) or {}
-        body = m.group(2)
-    else:
-        meta = {}
-        body = text
-    return meta, body
-
-def chunk_text(text: str, max_tokens: int = 500, overlap: int = 50):
-    tokens = tokenizer.encode(text)
-    chunks = []
-    start = 0
-    while start < len(tokens):
-        end = min(start + max_tokens, len(tokens))
-        chunk_toks = tokens[start:end]
-        chunks.append(tokenizer.decode(chunk_toks))
-        start += max_tokens - overlap
-    return chunks
-
-def process_file(path: str, vertical: str, language: str):
-    ext = Path(path).suffix.lower()
-    if ext in ['.md', '.markdown']:
-        raw = Path(path).read_text(encoding='utf-8')
-        meta, body = extract_front_matter_and_body(raw)
-    elif ext == '.docx':
-        doc = Document(path)
-        body = "\n".join(p.text for p in doc.paragraphs)
-        meta = {}
-    elif ext == '.pdf':
-        reader = PyPDF2.PdfReader(path)
-        pages = [page.extract_text() or "" for page in reader.pages]
-        body = "\n".join(pages)
-        meta = {}
-    else:
-        return []
-
-    default_meta = {
-        'vertical': vertical,
-        'language': language,
-        'source': Path(path).name
-    }
-    meta = {**default_meta, **meta}
-    records = []
-    for i, chunk in enumerate(chunk_text(body)):
-        emb = model.encode(chunk).tolist()
-        metadata = {
-            'id': f"{Path(path).stem}-chunk-{i+1:04d}",
-            'chunk_index': i+1,
-            **meta
-        }
-        records.append({'vector': emb, 'metadata': metadata})
-    return records
+from src.reader import read_file
+from src.chunker import chunk_text
+from src.embeddings import embed_texts

 def run_pipeline(files, vertical, language):
-
+    recs = []
     for file_path in files:
-
-
-
-
-
-
-
+        meta, body = read_file(Path(file_path))
+        base_meta = {"vertical": vertical, "language": language, "source": Path(file_path).name, **meta}
+        chunks = chunk_text(body)
+        vecs = embed_texts(chunks)
+        for i, (chunk, vec) in enumerate(zip(chunks, vecs), 1):
+            recs.append({
+                "id": f"{Path(file_path).stem}-chunk-{i:04d}",
+                "vector": vec,
+                "metadata": {**base_meta, "chunk_index": i}
+            })
+    out_path = f"/tmp/{uuid.uuid4().hex}.jsonl"
+    with open(out_path, "w", encoding="utf-8") as f:
+        for r in recs:
+            json.dump(r, f, ensure_ascii=False)
             f.write("\n")
-    return
+    return out_path

-
-
-    gr.Markdown("## Ingesta para Amazon S3 Vector Features")
+with gr.Blocks() as demo:
+    gr.Markdown("## Ingesta para Amazon S3 Vector Features")
     with gr.Row():
-        uploader = gr.File(label="Sube
-        vertical = gr.Textbox(label="Vertical
+        uploader = gr.File(label="Sube documentos", file_count="multiple", type="filepath")
+        vertical = gr.Textbox(label="Vertical", value="general")
         language = gr.Textbox(label="Idioma", value="es")
-    btn = gr.Button("Procesar y
-
-
-    btn.click(fn=run_pipeline, inputs=[uploader, vertical, language], outputs=output)
+    btn = gr.Button("Procesar y generar JSONL")
+    outfile = gr.File(label="Descarga JSONL")
+    btn.click(run_pipeline, inputs=[uploader, vertical, language], outputs=outfile)

 if __name__ == "__main__":
     demo.launch()
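
The new `run_pipeline` can also be exercised outside Gradio by calling it directly; it writes a JSONL under `/tmp` and returns the path, which the `gr.File` output then offers for download. A sketch, assuming the listed requirements are installed, the embedding model can be downloaded, and `sample_docs/intro.md` is a hypothetical input file:

```python
from app import run_pipeline

# Returns the path of the JSONL written for the uploaded files.
out_path = run_pipeline(["sample_docs/intro.md"], vertical="general", language="es")

with open(out_path, encoding="utf-8") as f:
    first = f.readline()

print(out_path)
print(first[:120])  # first record, truncated
```
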

requirements.txt CHANGED

@@ -1,6 +1,15 @@
-
-pyyaml
-python-docx
-PyPDF2
-sentence-transformers
-tiktoken
+# Core processing
+pyyaml>=6.0
+python-docx>=1.0
+PyPDF2>=3.0
+sentence-transformers>=2.7
+tiktoken>=0.7
+
+# CLI
+click>=8.1
+
+# UI (HuggingFace Space)
+gradio>=4.32
+
+# Opcional: variables de entorno
+python-dotenv>=1.0

scripts/run_preprocess.sh ADDED

@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+python -m src.preprocess \
+  --input-dir sample_docs \
+  --output dist/output.jsonl \
+  --vertical SEO-LLM \
+  --language es

src/__init__.py ADDED

@@ -0,0 +1 @@
+"""Package marker."""

src/chunker.py ADDED

@@ -0,0 +1,17 @@
+"""Chunking token‑based."""
+from typing import List
+import tiktoken
+from .config import CHUNK_SIZE, CHUNK_OVERLAP
+
+_tok = tiktoken.get_encoding("cl100k_base")
+
+def chunk_text(text: str,
+               max_tokens: int = CHUNK_SIZE,
+               overlap: int = CHUNK_OVERLAP) -> List[str]:
+    tokens = _tok.encode(text)
+    out, start, step = [], 0, max_tokens - overlap
+    while start < len(tokens):
+        end = min(start + max_tokens, len(tokens))
+        out.append(_tok.decode(tokens[start:end]))
+        start += step
+    return out
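
The window in `chunk_text` advances by `max_tokens - overlap` each step, so consecutive chunks share `overlap` tokens. A standalone sketch of the same arithmetic, using plain integers instead of real tokens so no tiktoken encoding needs to be downloaded:

```python
# Mirror of chunk_text's loop over a fake 1200-"token" document.
tokens = list(range(1200))
max_tokens, overlap = 500, 50      # the CHUNK_SIZE / CHUNK_OVERLAP defaults
step = max_tokens - overlap        # 450

start, spans = 0, []
while start < len(tokens):
    end = min(start + max_tokens, len(tokens))
    spans.append((start, end))
    start += step

print(spans)  # [(0, 500), (450, 950), (900, 1200)] -> 3 overlapping chunks
```
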

src/config.py ADDED

@@ -0,0 +1,14 @@
+"""Carga de configuración y constantes globales."""
+from pathlib import Path
+from dotenv import load_dotenv
+import os
+
+ENV_PATH = Path(__file__).resolve().parent.parent / ".env"
+if ENV_PATH.exists():
+    load_dotenv(ENV_PATH)
+
+EMBEDDING_MODEL: str = os.getenv("EMBEDDING_MODEL", "intfloat/e5-large-v2")
+DEVICE: str = os.getenv("DEVICE", "cpu")
+
+CHUNK_SIZE: int = 500      # tokens por chunk
+CHUNK_OVERLAP: int = 50    # solape entre chunks
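
Because `src/config.py` reads `EMBEDDING_MODEL` and `DEVICE` from the environment (optionally via a `.env` file) at import time, the model can be swapped without touching code. A sketch, assuming python-dotenv is installed and the script runs from the repo root; `all-MiniLM-L6-v2` is just an example override (it is the model the old app.py used):

```python
import os

# Hypothetical override: must be set before src.config is imported,
# since the module evaluates os.getenv() when it loads.
os.environ["EMBEDDING_MODEL"] = "all-MiniLM-L6-v2"
os.environ["DEVICE"] = "cpu"

from src import config

print(config.EMBEDDING_MODEL)                    # all-MiniLM-L6-v2
print(config.DEVICE)                             # cpu
print(config.CHUNK_SIZE, config.CHUNK_OVERLAP)   # 500 50
```
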

src/embeddings.py ADDED

@@ -0,0 +1,19 @@
+"""SentenceTransformer wrapper."""
+from typing import List
+from sentence_transformers import SentenceTransformer
+from .config import EMBEDDING_MODEL, DEVICE
+
+_model: SentenceTransformer | None = None
+
+def _model_instance() -> SentenceTransformer:
+    global _model
+    if _model is None:
+        _model = SentenceTransformer(EMBEDDING_MODEL, device=DEVICE)
+    return _model
+
+def embed_texts(texts: List[str]) -> List[List[float]]:
+    return _model_instance().encode(
+        texts,
+        show_progress_bar=False,
+        convert_to_numpy=False
+    ).tolist()
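
The module keeps a single lazily created model instance, so the slow model load happens once per process rather than once per call. A standalone sketch of that lazy-singleton pattern with a stand-in class (so nothing is downloaded here); the real module uses `SentenceTransformer` instead of `FakeModel`:

```python
from typing import List, Optional

class FakeModel:
    """Stand-in for SentenceTransformer: pretends construction is expensive."""
    def __init__(self, name: str):
        print(f"loading {name} ...")             # happens only once
        self.name = name
    def encode(self, texts: List[str]) -> List[List[float]]:
        return [[float(len(t))] for t in texts]  # dummy 1-d "embeddings"

_model: Optional[FakeModel] = None

def _model_instance() -> FakeModel:
    global _model
    if _model is None:                 # first call creates the instance
        _model = FakeModel("example-model")
    return _model                      # later calls reuse it

def embed_texts(texts: List[str]) -> List[List[float]]:
    return _model_instance().encode(texts)

print(embed_texts(["hola"]))    # triggers the load
print(embed_texts(["mundo"]))   # reuses the cached instance
```
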

src/preprocess.py ADDED

@@ -0,0 +1,41 @@
+"""CLI: lee → chunkea → embed → JSONL."""
+from pathlib import Path
+from typing import Dict, List
+import json
+import click
+from .reader import read_file
+from .chunker import chunk_text
+from .embeddings import embed_texts
+
+@click.command()
+@click.option("--input-dir", type=click.Path(exists=True, file_okay=False),
+              required=True, help="Carpeta con documentos.")
+@click.option("--output", type=click.Path(), required=True,
+              help="Ruta del JSONL de salida.")
+@click.option("--vertical", default="general", help="Vertical.")
+@click.option("--language", default="es", help="Idioma.")
+def main(input_dir: str, output: str, vertical: str, language: str):
+    recs: List[Dict] = []
+    for p in Path(input_dir).iterdir():
+        if not p.is_file():
+            continue
+        meta, body = read_file(p)
+        base_meta = {"vertical": vertical, "language": language, "source": p.name, **meta}
+        chunks = chunk_text(body)
+        vecs = embed_texts(chunks)
+        for i, (chunk, vec) in enumerate(zip(chunks, vecs), 1):
+            recs.append({
+                "id": f"{p.stem}-chunk-{i:04d}",
+                "vector": vec,
+                "metadata": {**base_meta, "chunk_index": i}
+            })
+    out = Path(output)
+    out.parent.mkdir(parents=True, exist_ok=True)
+    with out.open("w", encoding="utf-8") as f:
+        for r in recs:
+            json.dump(r, f, ensure_ascii=False)
+            f.write("\n")
+    click.echo(f"Wrote {len(recs)} records → {out}")
+
+if __name__ == "__main__":
+    main()
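
After the CLI runs, each line of the output file should parse as one self-contained record. A small sanity-check sketch, assuming the command in scripts/run_preprocess.sh has already produced `dist/output.jsonl`:

```python
import json
from pathlib import Path

out = Path("dist/output.jsonl")  # path used by scripts/run_preprocess.sh
records = [json.loads(line) for line in out.read_text(encoding="utf-8").splitlines()]

print(len(records), "records")
first = records[0]
print(sorted(first))                                              # ['id', 'metadata', 'vector']
print(first["metadata"]["source"], first["metadata"]["chunk_index"])
```
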

src/reader.py ADDED

@@ -0,0 +1,41 @@
+"""Lectura de Markdown, DOCX y PDF con front‑matter opcional."""
+from pathlib import Path
+from typing import Tuple, Dict
+import re
+import yaml
+from docx import Document as DocxDocument
+import PyPDF2
+
+_FM = re.compile(r"^---\n(.*?)\n---\n(.*)$", re.DOTALL)
+
+def _split_fm(text: str) -> Tuple[Dict, str]:
+    m = _FM.match(text)
+    if m:
+        meta_raw, body = m.groups()
+        meta = yaml.safe_load(meta_raw) or {}
+        return meta, body
+    return {}, text
+
+def _read_md(path: Path) -> Tuple[Dict, str]:
+    raw = path.read_text(encoding="utf-8")
+    return _split_fm(raw)
+
+def _read_docx(path: Path) -> Tuple[Dict, str]:
+    doc = DocxDocument(path)
+    body = "\n".join(p.text for p in doc.paragraphs)
+    return {}, body
+
+def _read_pdf(path: Path) -> Tuple[Dict, str]:
+    r = PyPDF2.PdfReader(str(path))
+    body = "\n".join(page.extract_text() or "" for page in r.pages)
+    return {}, body
+
+def read_file(path: Path) -> Tuple[Dict, str]:
+    ext = path.suffix.lower()
+    if ext in {".md", ".markdown"}:
+        return _read_md(path)
+    if ext == ".docx":
+        return _read_docx(path)
+    if ext == ".pdf":
+        return _read_pdf(path)
+    raise ValueError(f"Formato no soportado: {ext}")
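
Usage sketch for `read_file` on a Markdown document with YAML front matter, written to a temporary directory so the example is self-contained; it needs pyyaml, python-docx, and PyPDF2 installed since the module imports all three, and the front-matter values are just examples:

```python
import tempfile
from pathlib import Path
from src.reader import read_file

sample = """---
title: Ejemplo
vertical: SEO-LLM
---
Cuerpo del documento en Markdown.
"""

with tempfile.TemporaryDirectory() as d:
    p = Path(d) / "ejemplo.md"
    p.write_text(sample, encoding="utf-8")
    meta, body = read_file(p)       # front matter is split off and parsed as YAML

print(meta)   # {'title': 'Ejemplo', 'vertical': 'SEO-LLM'}
print(body)   # 'Cuerpo del documento en Markdown.\n'
```
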