File size: 1,632 Bytes
1670a80
c69ce57
 
4d8597b
dd58f3d
 
 
9710d1d
4d8597b
 
c69ce57
92e06e0
dd58f3d
 
 
c69ce57
 
9710d1d
c69ce57
dd58f3d
 
9710d1d
dd58f3d
 
 
c69ce57
dd58f3d
4d8597b
dd58f3d
4d8597b
dd58f3d
 
4d8597b
dd58f3d
 
4d8597b
dd58f3d
 
 
4d8597b
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import json
import tempfile
import uuid
from pathlib import Path

import gradio as gr

from src.chunker import chunk_text
from src.embeddings import embed_texts
from src.metadata_llm import extract_metadata
from src.reader import read_file

def run_pipeline(files, vertical, language):
    """Process uploaded documents into an embeddings JSONL file.

    For each file: read it, chunk the text, embed every chunk, and ask the
    metadata LLM to tag each chunk. One JSONL record is written per chunk.

    Args:
        files: List of file paths from the gradio uploader. May be None or
            empty when the button is clicked with nothing uploaded.
        vertical: Business-vertical tag copied into every record's metadata.
        language: Language tag copied into every record's metadata.

    Returns:
        str: Path of the generated JSONL file (one JSON object per line).
            The file is empty when no documents were provided.
    """
    records = []
    # Guard: gradio passes None when no files were selected — the original
    # code raised TypeError trying to iterate it.
    for file_path in files or []:
        path = Path(file_path)  # hoisted: original rebuilt Path() three times
        meta, body = read_file(path)
        # File-level metadata (**meta) intentionally wins over the form values.
        base_meta = {"vertical": vertical, "language": language, "source": path.name, **meta}
        chunks = chunk_text(body)
        vectors = embed_texts(chunks)
        for i, (chunk, vec) in enumerate(zip(chunks, vectors), 1):
            meta_llm = extract_metadata(chunk)
            records.append({
                "id": f"{path.stem}-chunk-{i:04d}",
                # NOTE(review): assumes embed_texts yields JSON-serializable
                # sequences; if it returns numpy arrays, json.dump below
                # fails — confirm against src.embeddings.
                "vector": vec,
                "metadata": {**base_meta, "chunk_index": i, **meta_llm},
            })
    # tempfile.gettempdir() instead of a hard-coded "/tmp": portable to
    # Windows and respects TMPDIR overrides.
    out_path = str(Path(tempfile.gettempdir()) / f"{uuid.uuid4().hex}.jsonl")
    with open(out_path, "w", encoding="utf-8") as f:
        for r in records:
            json.dump(r, f, ensure_ascii=False)
            f.write("\n")
    return out_path

# --- Gradio UI: upload form -> ingestion pipeline -> downloadable JSONL ----
with gr.Blocks() as demo:
    gr.Markdown("## Ingesta para Amazon S3 Vector Features")
    with gr.Row():
        doc_uploader = gr.File(label="Sube documentos", file_count="multiple", type="filepath")
        vertical_box = gr.Textbox(label="Vertical", value="general")
        language_box = gr.Textbox(label="Idioma", value="es")
    process_btn = gr.Button("Procesar y generar JSONL")
    result_file = gr.File(label="Descarga JSONL")
    # Wire the button: run the pipeline and expose the resulting JSONL file.
    process_btn.click(
        run_pipeline,
        inputs=[doc_uploader, vertical_box, language_box],
        outputs=result_file,
    )

# Launch the server only when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()