File size: 1,632 Bytes
1670a80
c69ce57
 
4d8597b
dd58f3d
 
 
9710d1d
4d8597b
 
c69ce57
92e06e0
dd58f3d
 
 
c69ce57
 
9710d1d
c69ce57
dd58f3d
 
9710d1d
dd58f3d
 
 
c69ce57
dd58f3d
4d8597b
dd58f3d
4d8597b
dd58f3d
 
4d8597b
dd58f3d
 
4d8597b
dd58f3d
 
 
4d8597b
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import json
import tempfile
import uuid
from pathlib import Path

import gradio as gr

from src.chunker import chunk_text
from src.embeddings import embed_texts
from src.metadata_llm import extract_metadata
from src.reader import read_file

def run_pipeline(files, vertical, language):
    """Process uploaded documents into an embeddings JSONL file.

    For each file: read it, chunk the text, embed every chunk, and ask the
    metadata LLM to tag each chunk. One JSONL record is written per chunk.

    Args:
        files: List of file paths from the gradio uploader. May be None or
            empty when the button is clicked with nothing uploaded.
        vertical: Business-vertical tag copied into every record's metadata.
        language: Language tag copied into every record's metadata.

    Returns:
        str: Path of the generated JSONL file (one JSON object per line).
            The file is empty when no documents were provided.
    """
    records = []
    # Guard: gradio passes None when no files were selected — the original
    # code raised TypeError trying to iterate it.
    for file_path in files or []:
        path = Path(file_path)  # hoisted: original rebuilt Path() three times
        meta, body = read_file(path)
        # File-level metadata (**meta) intentionally wins over the form values.
        base_meta = {"vertical": vertical, "language": language, "source": path.name, **meta}
        chunks = chunk_text(body)
        vectors = embed_texts(chunks)
        for i, (chunk, vec) in enumerate(zip(chunks, vectors), 1):
            meta_llm = extract_metadata(chunk)
            records.append({
                "id": f"{path.stem}-chunk-{i:04d}",
                # NOTE(review): assumes embed_texts yields JSON-serializable
                # sequences; if it returns numpy arrays, json.dump below
                # fails — confirm against src.embeddings.
                "vector": vec,
                "metadata": {**base_meta, "chunk_index": i, **meta_llm},
            })
    # tempfile.gettempdir() instead of a hard-coded "/tmp": portable to
    # Windows and respects TMPDIR overrides.
    out_path = str(Path(tempfile.gettempdir()) / f"{uuid.uuid4().hex}.jsonl")
    with open(out_path, "w", encoding="utf-8") as f:
        for r in records:
            json.dump(r, f, ensure_ascii=False)
            f.write("\n")
    return out_path

# --- Gradio UI: upload form -> ingestion pipeline -> downloadable JSONL ----
with gr.Blocks() as demo:
    gr.Markdown("## Ingesta para Amazon S3 Vector Features")
    with gr.Row():
        doc_uploader = gr.File(label="Sube documentos", file_count="multiple", type="filepath")
        vertical_box = gr.Textbox(label="Vertical", value="general")
        language_box = gr.Textbox(label="Idioma", value="es")
    process_btn = gr.Button("Procesar y generar JSONL")
    result_file = gr.File(label="Descarga JSONL")
    # Wire the button: run the pipeline and expose the resulting JSONL file.
    process_btn.click(
        run_pipeline,
        inputs=[doc_uploader, vertical_box, language_box],
        outputs=result_file,
    )

# Launch the server only when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()