File size: 1,632 Bytes
1670a80 c69ce57 4d8597b dd58f3d 9710d1d 4d8597b c69ce57 92e06e0 dd58f3d c69ce57 9710d1d c69ce57 dd58f3d 9710d1d dd58f3d c69ce57 dd58f3d 4d8597b dd58f3d 4d8597b dd58f3d 4d8597b dd58f3d 4d8597b dd58f3d 4d8597b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 |
import json
import tempfile
import uuid
from pathlib import Path

import gradio as gr

from src.chunker import chunk_text
from src.embeddings import embed_texts
from src.metadata_llm import extract_metadata
from src.reader import read_file
def run_pipeline(files, vertical, language):
    """Read, chunk, embed and tag the uploaded documents, then write one
    JSON record per chunk to a temporary JSONL file.

    Parameters
    ----------
    files : list[str] | None
        Filesystem paths of the uploaded documents. ``None``/empty yields
        an empty (but valid) JSONL file instead of crashing.
    vertical : str
        Business-vertical tag stored in every chunk's metadata.
    language : str
        Language code stored in every chunk's metadata.

    Returns
    -------
    str
        Path of the generated JSONL file (one record per line).
    """
    records = []
    # Gradio's File component passes None when nothing was uploaded.
    for file_path in files or []:
        path = Path(file_path)  # hoisted: used for reading, .name and .stem
        meta, body = read_file(path)
        # File-level metadata from the reader overrides the UI-supplied tags.
        base_meta = {
            "vertical": vertical,
            "language": language,
            "source": path.name,
            **meta,
        }
        chunks = chunk_text(body)
        vectors = embed_texts(chunks)
        for i, (chunk, vec) in enumerate(zip(chunks, vectors), 1):
            # Per-chunk metadata extracted by the LLM helper.
            meta_llm = extract_metadata(chunk)
            records.append({
                "id": f"{path.stem}-chunk-{i:04d}",
                # NOTE(review): assumes `vec` is JSON-serializable (e.g. a
                # plain list of floats) — confirm embed_texts' return type.
                "vector": vec,
                "metadata": {**base_meta, "chunk_index": i, **meta_llm},
            })
    # Portable temp directory instead of a hard-coded "/tmp" (which does
    # not exist on Windows).
    out_path = str(Path(tempfile.gettempdir()) / f"{uuid.uuid4().hex}.jsonl")
    with open(out_path, "w", encoding="utf-8") as f:
        for r in records:
            json.dump(r, f, ensure_ascii=False)
            f.write("\n")
    return out_path
# --- Gradio UI -------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("## Ingesta para Amazon S3 Vector Features")

    # Input row: the document uploader plus the two metadata fields that
    # are attached to every generated chunk.
    with gr.Row():
        file_input = gr.File(label="Sube documentos", file_count="multiple", type="filepath")
        vertical_box = gr.Textbox(label="Vertical", value="general")
        language_box = gr.Textbox(label="Idioma", value="es")

    process_btn = gr.Button("Procesar y generar JSONL")
    result_file = gr.File(label="Descarga JSONL")

    # Wire the button to the ingestion pipeline; the returned path feeds
    # the download component.
    process_btn.click(
        run_pipeline,
        inputs=[file_input, vertical_box, language_box],
        outputs=result_file,
    )

if __name__ == "__main__":
    demo.launch()
|