Marcos Morales committed
Commit dd58f3d · 1 Parent(s): 11af251

modified: README.md
modified: app.py
modified: requirements.txt
new file: scripts/run_preprocess.sh
new file: src/__init__.py
new file: src/chunker.py
new file: src/config.py
new file: src/embeddings.py
new file: src/preprocess.py
new file: src/reader.py

Files changed (10)
  1. README.md +10 -13
  2. app.py +27 -85
  3. requirements.txt +15 -6
  4. scripts/run_preprocess.sh +6 -0
  5. src/__init__.py +1 -0
  6. src/chunker.py +17 -0
  7. src/config.py +14 -0
  8. src/embeddings.py +19 -0
  9. src/preprocess.py +41 -0
  10. src/reader.py +41 -0
README.md CHANGED
@@ -1,14 +1,11 @@
- ---
- title: Chunkings
- emoji: 🏃
- colorFrom: indigo
- colorTo: blue
- sdk: gradio
- sdk_version: 5.38.0
- app_file: app.py
- pinned: false
- license: apache-2.0
- short_description: Docs to chunks
- ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # HF Vector Pipeline

+ End-to-end pipeline that converts documents (`.md`, `.docx`, `.pdf`)
+ into **JSONL with embeddings** and metadata, ready to load into
+ **Amazon S3 Vector Features**.
+
+ Includes:
+
+ * CLI (`python -m src.preprocess …`)
+ * Gradio UI (**app.py**), ready for HuggingFace Spaces
+ * Support for Windows 11 + VSCode
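For orientation, here is a minimal sketch of a single record in the generated JSONL (field names follow the code in app.py and src/preprocess.py below; the vector is truncated and all values are purely illustrative):

```json
{"id": "guia-seo-chunk-0001", "vector": [0.0123, -0.0456, 0.0789], "metadata": {"vertical": "SEO", "language": "es", "source": "guia-seo.md", "chunk_index": 1}}
```

Each record is written on its own line, one JSON object per chunk.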
app.py CHANGED
@@ -1,97 +1,39 @@
  import gradio as gr
- import yaml
- import json
- import uuid
  from pathlib import Path
- from docx import Document
- import PyPDF2
- from sentence_transformers import SentenceTransformer
- import tiktoken
-
- model = SentenceTransformer('all-MiniLM-L6-v2')
- tokenizer = tiktoken.get_encoding("cl100k_base")
-
- def extract_front_matter_and_body(text: str):
-     import re
-     fm_regex = r"^---\n(.*?)\n---\n(.*)$"
-     m = re.match(fm_regex, text, re.DOTALL)
-     if m:
-         meta = yaml.safe_load(m.group(1)) or {}
-         body = m.group(2)
-     else:
-         meta = {}
-         body = text
-     return meta, body
-
- def chunk_text(text: str, max_tokens: int = 500, overlap: int = 50):
-     tokens = tokenizer.encode(text)
-     chunks = []
-     start = 0
-     while start < len(tokens):
-         end = min(start + max_tokens, len(tokens))
-         chunk_toks = tokens[start:end]
-         chunks.append(tokenizer.decode(chunk_toks))
-         start += max_tokens - overlap
-     return chunks
-
- def process_file(path: str, vertical: str, language: str):
-     ext = Path(path).suffix.lower()
-     if ext in ['.md', '.markdown']:
-         raw = Path(path).read_text(encoding='utf-8')
-         meta, body = extract_front_matter_and_body(raw)
-     elif ext == '.docx':
-         doc = Document(path)
-         body = "\n".join(p.text for p in doc.paragraphs)
-         meta = {}
-     elif ext == '.pdf':
-         reader = PyPDF2.PdfReader(path)
-         pages = [page.extract_text() or "" for page in reader.pages]
-         body = "\n".join(pages)
-         meta = {}
-     else:
-         return []
-
-     default_meta = {
-         'vertical': vertical,
-         'language': language,
-         'source': Path(path).name
-     }
-     meta = {**default_meta, **meta}
-     records = []
-     for i, chunk in enumerate(chunk_text(body)):
-         emb = model.encode(chunk).tolist()
-         metadata = {
-             'id': f"{Path(path).stem}-chunk-{i+1:04d}",
-             'chunk_index': i+1,
-             **meta
-         }
-         records.append({'vector': emb, 'metadata': metadata})
-     return records

  def run_pipeline(files, vertical, language):
-     all_records = []
      for file_path in files:
-         recs = process_file(file_path, vertical, language)
-         all_records.extend(recs)
-
-     out_file = f"/tmp/{uuid.uuid4().hex}.jsonl"
-     with open(out_file, 'w', encoding='utf-8') as f:
-         for rec in all_records:
-             json.dump({'id': rec['metadata']['id'], 'vector': rec['vector'], 'metadata': rec['metadata']}, f, ensure_ascii=False)
              f.write("\n")
-     return out_file

- demo = gr.Blocks()
- with demo:
-     gr.Markdown("## Ingesta para Amazon S3 Vector Features")
      with gr.Row():
-         uploader = gr.File(label="Sube tus documentos", file_count="multiple", type="filepath")
-         vertical = gr.Textbox(label="Vertical (p.ej. SEO, eCommerce)", value="general")
          language = gr.Textbox(label="Idioma", value="es")
-     btn = gr.Button("Procesar y Generar JSONL")
-     output = gr.File(label="Descarga el JSONL")
-
-     btn.click(fn=run_pipeline, inputs=[uploader, vertical, language], outputs=output)

  if __name__ == "__main__":
      demo.launch()
  import gradio as gr
+ import yaml, json, uuid, os
  from pathlib import Path
+ from src.reader import read_file
+ from src.chunker import chunk_text
+ from src.embeddings import embed_texts

  def run_pipeline(files, vertical, language):
+     recs = []
      for file_path in files:
+         meta, body = read_file(Path(file_path))
+         base_meta = {"vertical": vertical, "language": language, "source": Path(file_path).name, **meta}
+         chunks = chunk_text(body)
+         vecs = embed_texts(chunks)
+         for i, (chunk, vec) in enumerate(zip(chunks, vecs), 1):
+             recs.append({
+                 "id": f"{Path(file_path).stem}-chunk-{i:04d}",
+                 "vector": vec,
+                 "metadata": {**base_meta, "chunk_index": i}
+             })
+     out_path = f"/tmp/{uuid.uuid4().hex}.jsonl"
+     with open(out_path, "w", encoding="utf-8") as f:
+         for r in recs:
+             json.dump(r, f, ensure_ascii=False)
              f.write("\n")
+     return out_path

+ with gr.Blocks() as demo:
+     gr.Markdown("## Ingesta para Amazon S3 Vector Features")
      with gr.Row():
+         uploader = gr.File(label="Sube documentos", file_count="multiple", type="filepath")
+         vertical = gr.Textbox(label="Vertical", value="general")
          language = gr.Textbox(label="Idioma", value="es")
+     btn = gr.Button("Procesar y generar JSONL")
+     outfile = gr.File(label="Descarga JSONL")
+     btn.click(run_pipeline, inputs=[uploader, vertical, language], outputs=outfile)

  if __name__ == "__main__":
      demo.launch()
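As a quick illustration of how the slimmed-down app.py delegates to the src package, run_pipeline can also be driven without launching the UI; a minimal sketch (the input path below is hypothetical):

```python
# Sketch only: call the pipeline directly, bypassing the Gradio UI.
from app import run_pipeline

# "sample_docs/guide.md" is a hypothetical file path.
jsonl_path = run_pipeline(["sample_docs/guide.md"], vertical="SEO", language="es")
print(jsonl_path)  # e.g. /tmp/<hex>.jsonl, one JSON record per line
```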
requirements.txt CHANGED
@@ -1,6 +1,15 @@
- gradio
- pyyaml
- python-docx
- PyPDF2
- sentence-transformers
- tiktoken

+ # Core processing
+ pyyaml>=6.0
+ python-docx>=1.0
+ PyPDF2>=3.0
+ sentence-transformers>=2.7
+ tiktoken>=0.7
+
+ # CLI
+ click>=8.1
+
+ # UI (HuggingFace Space)
+ gradio>=4.32
+
+ # Optional: environment variables
+ python-dotenv>=1.0
scripts/run_preprocess.sh ADDED
@@ -0,0 +1,6 @@
+ #!/usr/bin/env bash
+ python -m src.preprocess \
+     --input-dir sample_docs \
+     --output dist/output.jsonl \
+     --vertical SEO-LLM \
+     --language es
src/__init__.py ADDED
@@ -0,0 +1 @@
+ """Package marker."""
src/chunker.py ADDED
@@ -0,0 +1,17 @@
+ """Token-based chunking."""
+ from typing import List
+ import tiktoken
+ from .config import CHUNK_SIZE, CHUNK_OVERLAP
+
+ _tok = tiktoken.get_encoding("cl100k_base")
+
+ def chunk_text(text: str,
+                max_tokens: int = CHUNK_SIZE,
+                overlap: int = CHUNK_OVERLAP) -> List[str]:
+     tokens = _tok.encode(text)
+     out, start, step = [], 0, max_tokens - overlap
+     while start < len(tokens):
+         end = min(start + max_tokens, len(tokens))
+         out.append(_tok.decode(tokens[start:end]))
+         start += step
+     return out
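To make the sliding window concrete, a small sketch of the offsets chunk_text produces with the defaults from src/config.py (the input text here is a dummy placeholder):

```python
from src.chunker import chunk_text

# With CHUNK_SIZE=500 and CHUNK_OVERLAP=50 the window advances by 450 tokens
# per step, so consecutive chunks share 50 tokens. A text that encodes to
# exactly 1,000 tokens yields three chunks:
#   chunk 1 -> tokens [0, 500)
#   chunk 2 -> tokens [450, 950)
#   chunk 3 -> tokens [900, 1000)
long_text = "word " * 1000          # dummy input, roughly 1,000 tokens
chunks = chunk_text(long_text)
print(len(chunks), "chunks")
```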
src/config.py ADDED
@@ -0,0 +1,14 @@
+ """Configuration loading and global constants."""
+ from pathlib import Path
+ from dotenv import load_dotenv
+ import os
+
+ ENV_PATH = Path(__file__).resolve().parent.parent / ".env"
+ if ENV_PATH.exists():
+     load_dotenv(ENV_PATH)
+
+ EMBEDDING_MODEL: str = os.getenv("EMBEDDING_MODEL", "intfloat/e5-large-v2")
+ DEVICE: str = os.getenv("DEVICE", "cpu")
+
+ CHUNK_SIZE: int = 500      # tokens per chunk
+ CHUNK_OVERLAP: int = 50    # overlap between chunks
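A minimal .env sketch for overriding these defaults locally (the variable names come from the code above; the values are only examples, the model being the one used by the previous app.py):

```
# .env file, read by src/config.py when present at the repository root
EMBEDDING_MODEL=all-MiniLM-L6-v2
DEVICE=cpu
```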
src/embeddings.py ADDED
@@ -0,0 +1,19 @@
+ """SentenceTransformer wrapper."""
+ from typing import List
+ from sentence_transformers import SentenceTransformer
+ from .config import EMBEDDING_MODEL, DEVICE
+
+ _model: SentenceTransformer | None = None
+
+ def _model_instance() -> SentenceTransformer:
+     global _model
+     if _model is None:
+         _model = SentenceTransformer(EMBEDDING_MODEL, device=DEVICE)
+     return _model
+
+ def embed_texts(texts: List[str]) -> List[List[float]]:
+     return _model_instance().encode(
+         texts,
+         show_progress_bar=False,
+         convert_to_numpy=True
+     ).tolist()
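A brief usage sketch of the wrapper (the sample sentences are placeholders; the vector dimensionality depends on the configured EMBEDDING_MODEL):

```python
from src.embeddings import embed_texts

vecs = embed_texts(["first sample sentence", "second sample sentence"])
print(len(vecs), len(vecs[0]))  # 2 vectors, each of the model's embedding size
```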
src/preprocess.py ADDED
@@ -0,0 +1,41 @@
+ """CLI: read → chunk → embed → JSONL."""
+ from pathlib import Path
+ from typing import Dict, List
+ import json
+ import click
+ from .reader import read_file
+ from .chunker import chunk_text
+ from .embeddings import embed_texts
+
+ @click.command()
+ @click.option("--input-dir", type=click.Path(exists=True, file_okay=False),
+               required=True, help="Carpeta con documentos.")
+ @click.option("--output", type=click.Path(), required=True,
+               help="Ruta del JSONL de salida.")
+ @click.option("--vertical", default="general", help="Vertical.")
+ @click.option("--language", default="es", help="Idioma.")
+ def main(input_dir: str, output: str, vertical: str, language: str):
+     recs: List[Dict] = []
+     for p in Path(input_dir).iterdir():
+         if not p.is_file():
+             continue
+         meta, body = read_file(p)
+         base_meta = {"vertical": vertical, "language": language, "source": p.name, **meta}
+         chunks = chunk_text(body)
+         vecs = embed_texts(chunks)
+         for i, (chunk, vec) in enumerate(zip(chunks, vecs), 1):
+             recs.append({
+                 "id": f"{p.stem}-chunk-{i:04d}",
+                 "vector": vec,
+                 "metadata": {**base_meta, "chunk_index": i}
+             })
+     out = Path(output)
+     out.parent.mkdir(parents=True, exist_ok=True)
+     with out.open("w", encoding="utf-8") as f:
+         for r in recs:
+             json.dump(r, f, ensure_ascii=False)
+             f.write("\n")
+     click.echo(f"Wrote {len(recs)} records → {out}")
+
+ if __name__ == "__main__":
+     main()
src/reader.py ADDED
@@ -0,0 +1,41 @@
+ """Read Markdown, DOCX, and PDF, with optional front matter."""
+ from pathlib import Path
+ from typing import Tuple, Dict
+ import re
+ import yaml
+ from docx import Document as DocxDocument
+ import PyPDF2
+
+ _FM = re.compile(r"^---\n(.*?)\n---\n(.*)$", re.DOTALL)
+
+ def _split_fm(text: str) -> Tuple[Dict, str]:
+     m = _FM.match(text)
+     if m:
+         meta_raw, body = m.groups()
+         meta = yaml.safe_load(meta_raw) or {}
+         return meta, body
+     return {}, text
+
+ def _read_md(path: Path) -> Tuple[Dict, str]:
+     raw = path.read_text(encoding="utf-8")
+     return _split_fm(raw)
+
+ def _read_docx(path: Path) -> Tuple[Dict, str]:
+     doc = DocxDocument(path)
+     body = "\n".join(p.text for p in doc.paragraphs)
+     return {}, body
+
+ def _read_pdf(path: Path) -> Tuple[Dict, str]:
+     r = PyPDF2.PdfReader(str(path))
+     body = "\n".join(page.extract_text() or "" for page in r.pages)
+     return {}, body
+
+ def read_file(path: Path) -> Tuple[Dict, str]:
+     ext = path.suffix.lower()
+     if ext in {".md", ".markdown"}:
+         return _read_md(path)
+     if ext == ".docx":
+         return _read_docx(path)
+     if ext == ".pdf":
+         return _read_pdf(path)
+     raise ValueError(f"Formato no soportado: {ext}")
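Finally, a small sketch of how read_file behaves for Markdown with YAML front matter (the path and front-matter keys are hypothetical; for .docx and .pdf the returned meta dict is always empty):

```python
from pathlib import Path
from src.reader import read_file

# Hypothetical file sample_docs/guia.md, starting with a front-matter block:
#   ---
#   title: Guía de enlazado interno
#   author: equipo-seo
#   ---
meta, body = read_file(Path("sample_docs/guia.md"))
print(meta)       # {'title': 'Guía de enlazado interno', 'author': 'equipo-seo'}
print(body[:60])  # body text, with the front matter already stripped
```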