|
import json
import tempfile
import uuid
from pathlib import Path

import gradio as gr

from src.chunker import chunk_text
from src.embeddings import embed_texts
from src.metadata_llm import extract_metadata
from src.reader import read_file
|
|
|
def run_pipeline(files, vertical, language):
    """Read, chunk, embed and enrich uploaded documents into a JSONL file.

    For every uploaded file: read it, split the body into chunks, embed the
    chunks, and ask the metadata LLM to enrich each chunk. One JSON record
    per chunk is written, each with "id", "vector" and "metadata" keys.

    Args:
        files: Iterable of file-path strings from the Gradio uploader.
            May be ``None`` when nothing was uploaded (Gradio passes None).
        vertical: Vertical tag stored in each record's metadata.
        language: Language code stored in each record's metadata.

    Returns:
        str: Path of the generated JSONL file (one record per line).
    """
    records = []
    # gr.File hands us None when no file was selected; treat it as "no files"
    # instead of crashing with a TypeError on iteration.
    for file_path in files or ():
        path = Path(file_path)  # build once; reused for name and stem below
        meta, body = read_file(path)
        base_meta = {
            "vertical": vertical,
            "language": language,
            "source": path.name,
            **meta,  # per-file metadata from the reader may override the defaults
        }
        chunks = chunk_text(body)
        vectors = embed_texts(chunks)
        for i, (chunk, vec) in enumerate(zip(chunks, vectors), 1):
            meta_llm = extract_metadata(chunk)
            records.append({
                "id": f"{path.stem}-chunk-{i:04d}",
                "vector": vec,
                "metadata": {**base_meta, "chunk_index": i, **meta_llm},
            })

    # tempfile.gettempdir() is portable; the previous hard-coded "/tmp/"
    # only exists on Unix-like systems.
    out_path = str(Path(tempfile.gettempdir()) / f"{uuid.uuid4().hex}.jsonl")
    with open(out_path, "w", encoding="utf-8") as f:
        for record in records:
            json.dump(record, f, ensure_ascii=False)
            f.write("\n")
    return out_path
|
|
|
with gr.Blocks() as demo:
    # One-page UI: upload documents, tag them, then download the JSONL output.
    gr.Markdown("## Ingesta para Amazon S3 Vector Features")

    with gr.Row():
        file_input = gr.File(label="Sube documentos", file_count="multiple", type="filepath")
        vertical_box = gr.Textbox(label="Vertical", value="general")
        language_box = gr.Textbox(label="Idioma", value="es")

    process_btn = gr.Button("Procesar y generar JSONL")
    result_file = gr.File(label="Descarga JSONL")

    # Wire the button to the pipeline: three inputs in, one downloadable file out.
    process_btn.click(
        run_pipeline,
        inputs=[file_input, vertical_box, language_box],
        outputs=result_file,
    )


if __name__ == "__main__":
    demo.launch()
|
|