# Chunkings/src/preprocess.py
# Author: Marcos Morales
# NOTE(review): the two lines below look like pasted git metadata
# ("modified: app.py", commit 9710d1d); kept as comments so the module parses.
"""CLI entry‑point: read files → chunk → embed → enrich metadata → JSONL."""
from pathlib import Path
from typing import Dict, List
import json
import click
from .reader import read_file
from .chunker import chunk_text
from .embeddings import embed_texts
from .metadata_llm import extract_metadata
@click.command()
@click.option('--input-dir', type=click.Path(exists=True, file_okay=False), required=True, help='Directory with docs.')
@click.option('--output', type=click.Path(), required=True, help='JSONL output path.')
@click.option('--vertical', default='general', help='Vertical tag.')
@click.option('--language', default='es', help='Language tag.')
def main(input_dir: str, output: str, vertical: str, language: str) -> None:
    """Read each file in *input_dir*, chunk it, embed the chunks, enrich
    them with LLM-extracted metadata, and write one JSON record per chunk
    to *output* in JSONL format.

    Improvements over the previous revision:
    - Records are streamed to disk as they are produced instead of being
      accumulated (with their embedding vectors) in a list first, so memory
      use no longer grows with corpus size.
    - Files are visited in sorted order: ``Path.iterdir`` order is
      filesystem-dependent, which made the output non-deterministic.
    """
    out_path = Path(output)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    count = 0
    with out_path.open('w', encoding='utf-8') as f:
        for path in sorted(Path(input_dir).iterdir()):
            if not path.is_file():
                continue
            meta, body = read_file(path)
            # CLI tags first; per-file metadata from the reader wins on clash.
            merged_meta = {
                'vertical': vertical,
                'language': language,
                'source': path.name,
                **meta,
            }
            chunks = chunk_text(body)
            embeddings = embed_texts(chunks)
            # 1-based index so ids read chunk-0001, chunk-0002, ...
            for i, (chunk, vec) in enumerate(zip(chunks, embeddings), 1):
                meta_llm = extract_metadata(chunk)
                rec = {
                    'id': f"{path.stem}-chunk-{i:04d}",
                    'vector': vec,
                    'metadata': {
                        **merged_meta,
                        'chunk_index': i,
                        # LLM fields (summary, topics, vertical, language)
                        # spread last, as before: LLM values override tags.
                        **meta_llm,
                    },
                }
                json.dump(rec, f, ensure_ascii=False)
                f.write('\n')
                count += 1
    click.echo(f"Wrote {count} records → {out_path}")
# Script entry point: click supplies the CLI arguments when invoked directly.
if __name__ == '__main__':
    main()