import json
import tempfile
import uuid
from pathlib import Path

import gradio as gr

from src.reader import read_file
from src.chunker import chunk_text
from src.embeddings import embed_texts
from src.metadata_llm import extract_metadata


def run_pipeline(files, vertical, language):
    """Read, chunk, embed, and enrich the uploaded files, then emit one JSONL file."""
    records = []
    for file_path in files or []:  # gr.File yields None when nothing was uploaded
        meta, body = read_file(Path(file_path))
        # Static metadata shared by every chunk of this document.
        base_meta = {
            "vertical": vertical,
            "language": language,
            "source": Path(file_path).name,
            **meta,
        }
        chunks = chunk_text(body)
        vectors = embed_texts(chunks)
        for i, (chunk, vec) in enumerate(zip(chunks, vectors), 1):
            meta_llm = extract_metadata(chunk)  # per-chunk LLM-extracted metadata
            records.append({
                "id": f"{Path(file_path).stem}-chunk-{i:04d}",
                "vector": vec,
                "metadata": {**base_meta, "chunk_index": i, **meta_llm},
            })
    # Write all records as JSONL to a uniquely named temp file for download.
    out_path = str(Path(tempfile.gettempdir()) / f"{uuid.uuid4().hex}.jsonl")
    with open(out_path, "w", encoding="utf-8") as f:
        for r in records:
            json.dump(r, f, ensure_ascii=False)
            f.write("\n")
    return out_path


with gr.Blocks() as demo:
    gr.Markdown("## Ingestion for Amazon S3 Vectors")
    with gr.Row():
        uploader = gr.File(label="Upload documents", file_count="multiple", type="filepath")
        vertical = gr.Textbox(label="Vertical", value="general")
        language = gr.Textbox(label="Language", value="es")
    btn = gr.Button("Process and generate JSONL")
    outfile = gr.File(label="Download JSONL")
    btn.click(run_pipeline, inputs=[uploader, vertical, language], outputs=outfile)


if __name__ == "__main__":
    demo.launch()
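

# ---------------------------------------------------------------------------
# The helper modules above (src.reader, src.chunker, src.embeddings,
# src.metadata_llm) are not shown in this file. As a hedged illustration of
# the contract run_pipeline relies on, here is a minimal stand-in for
# src.chunker.chunk_text: text in, ordered list[str] out. The name, sizes,
# and character-based splitting are assumptions; the real chunker may well
# be token-aware.
def _chunk_text_sketch(text: str, size: int = 1000, overlap: int = 200) -> list[str]:
    """Hypothetical stand-in for src.chunker.chunk_text (fixed-size, overlapping)."""
    step = size - overlap  # advance by size minus overlap so adjacent chunks share context
    return [text[i:i + size] for i in range(0, max(len(text), 1), step)]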
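

# ---------------------------------------------------------------------------
# Downstream sketch (not wired into the UI): the JSONL produced by
# run_pipeline is meant to be loaded into an Amazon S3 Vectors index. A
# minimal sketch follows, assuming boto3 with the "s3vectors" client and its
# put_vectors call; the bucket and index names are placeholders, and the
# exact request shape should be checked against the current AWS documentation.
def upload_jsonl_to_s3_vectors(jsonl_path, vector_bucket="my-vector-bucket",
                               index_name="docs-index", batch_size=100):
    """Push JSONL records into an S3 Vectors index in small batches (sketch)."""
    import boto3  # assumed dependency, imported lazily so the app runs without it

    client = boto3.client("s3vectors")
    batch = []
    with open(jsonl_path, encoding="utf-8") as f:
        for line in f:
            rec = json.loads(line)
            batch.append({
                "key": rec["id"],                    # unique vector key
                "data": {"float32": rec["vector"]},  # embedding values
                "metadata": rec["metadata"],         # filterable metadata
            })
            if len(batch) >= batch_size:  # small batches keep requests within API limits
                client.put_vectors(vectorBucketName=vector_bucket,
                                   indexName=index_name, vectors=batch)
                batch = []
    if batch:  # flush the remainder
        client.put_vectors(vectorBucketName=vector_bucket,
                           indexName=index_name, vectors=batch)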