"""CLI entry-point: read files → chunk → embed → enrich metadata → JSONL."""

import json
from pathlib import Path
from typing import Dict, List

import click

from .reader import read_file
from .chunker import chunk_text
from .embeddings import embed_texts
from .metadata_llm import extract_metadata


@click.command()
@click.option('--input-dir', type=click.Path(exists=True, file_okay=False), required=True, help='Directory with docs.')
@click.option('--output', type=click.Path(), required=True, help='JSONL output path.')
@click.option('--vertical', default='general', help='Vertical tag.')
@click.option('--language', default='es', help='Language tag.')
def main(input_dir: str, output: str, vertical: str, language: str):
    records: List[Dict] = []
    for path in Path(input_dir).iterdir():
        if not path.is_file():
            continue
        meta, body = read_file(path)
        # CLI-level tags come first so per-file metadata wins on key conflicts.
        merged_meta = {
            'vertical': vertical,
            'language': language,
            'source': path.name,
            **meta
        }
        chunks = chunk_text(body)
        embeddings = embed_texts(chunks)
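        # Each chunk becomes one JSONL record: an id derived from the file stem,
        # the embedding vector, and merged metadata (CLI tags, file metadata,
        # chunk_index, and the LLM-extracted fields).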
        for i, (chunk, vec) in enumerate(zip(chunks, embeddings), 1):
            meta_llm = extract_metadata(chunk)
            rec = {
                'id': f"{path.stem}-chunk-{i:04d}",
                'vector': vec,
                'metadata': {
                    **merged_meta,
                    'chunk_index': i,
                    **meta_llm
                }
            }
            records.append(rec)

    out_path = Path(output)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open('w', encoding='utf-8') as f:
        for r in records:
            # One JSON object per line (JSONL); ensure_ascii=False keeps non-ASCII text unescaped.
            json.dump(r, f, ensure_ascii=False)
            f.write('\n')
    click.echo(f"Wrote {len(records)} records → {out_path}")


if __name__ == '__main__':
    main()
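
# Example invocation (assumption: the package name and paths below are placeholders,
# not the project's real ones):
#   python -m yourpackage.cli --input-dir ./docs --output out/chunks.jsonl --vertical legal --language es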