"""CLI entry‑point: read files → chunk → embed → enrich metadata → JSONL.""" from pathlib import Path from typing import Dict, List import json import click from .reader import read_file from .chunker import chunk_text from .embeddings import embed_texts from .metadata_llm import extract_metadata @click.command() @click.option('--input-dir', type=click.Path(exists=True, file_okay=False), required=True, help='Directory with docs.') @click.option('--output', type=click.Path(), required=True, help='JSONL output path.') @click.option('--vertical', default='general', help='Vertical tag.') @click.option('--language', default='es', help='Language tag.') def main(input_dir: str, output: str, vertical: str, language: str): records: List[Dict] = [] for path in Path(input_dir).iterdir(): if not path.is_file(): continue meta, body = read_file(path) merged_meta = { 'vertical': vertical, 'language': language, 'source': path.name, **meta } chunks = chunk_text(body) embeddings = embed_texts(chunks) for i, (chunk, vec) in enumerate(zip(chunks, embeddings), 1): meta_llm = extract_metadata(chunk) rec = { 'id': f"{path.stem}-chunk-{i:04d}", 'vector': vec, 'metadata': { **merged_meta, 'chunk_index': i, **meta_llm # summary, topics, vertical, language } } records.append(rec) out_path = Path(output) out_path.parent.mkdir(parents=True, exist_ok=True) with out_path.open('w', encoding='utf-8') as f: for r in records: json.dump(r, f, ensure_ascii=False) f.write('\n') click.echo(f"Wrote {len(records)} records → {out_path}") if __name__ == '__main__': main()