Marcos Morales
committed on
Commit
·
c69ce57
1
Parent(s):
dd58f3d
modified: app.py
Browse files
app.py
CHANGED
|
@@ -1,26 +1,27 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
-
import
|
|
|
|
| 3 |
from pathlib import Path
|
| 4 |
from src.reader import read_file
|
| 5 |
from src.chunker import chunk_text
|
| 6 |
from src.embeddings import embed_texts
|
| 7 |
|
| 8 |
def run_pipeline(files, vertical, language):
|
| 9 |
-
|
| 10 |
for file_path in files:
|
| 11 |
meta, body = read_file(Path(file_path))
|
| 12 |
base_meta = {"vertical": vertical, "language": language, "source": Path(file_path).name, **meta}
|
| 13 |
chunks = chunk_text(body)
|
| 14 |
-
|
| 15 |
-
for i, (chunk, vec) in enumerate(zip(chunks,
|
| 16 |
-
|
| 17 |
"id": f"{Path(file_path).stem}-chunk-{i:04d}",
|
| 18 |
"vector": vec,
|
| 19 |
"metadata": {**base_meta, "chunk_index": i}
|
| 20 |
})
|
| 21 |
out_path = f"/tmp/{uuid.uuid4().hex}.jsonl"
|
| 22 |
with open(out_path, "w", encoding="utf-8") as f:
|
| 23 |
-
for r in
|
| 24 |
json.dump(r, f, ensure_ascii=False)
|
| 25 |
f.write("\n")
|
| 26 |
return out_path
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
+
import json
|
| 3 |
+
import uuid
|
| 4 |
from pathlib import Path
|
| 5 |
from src.reader import read_file
|
| 6 |
from src.chunker import chunk_text
|
| 7 |
from src.embeddings import embed_texts
|
| 8 |
|
| 9 |
def run_pipeline(files, vertical, language):
|
| 10 |
+
records = []
|
| 11 |
for file_path in files:
|
| 12 |
meta, body = read_file(Path(file_path))
|
| 13 |
base_meta = {"vertical": vertical, "language": language, "source": Path(file_path).name, **meta}
|
| 14 |
chunks = chunk_text(body)
|
| 15 |
+
vectors = embed_texts(chunks)
|
| 16 |
+
for i, (chunk, vec) in enumerate(zip(chunks, vectors), 1):
|
| 17 |
+
records.append({
|
| 18 |
"id": f"{Path(file_path).stem}-chunk-{i:04d}",
|
| 19 |
"vector": vec,
|
| 20 |
"metadata": {**base_meta, "chunk_index": i}
|
| 21 |
})
|
| 22 |
out_path = f"/tmp/{uuid.uuid4().hex}.jsonl"
|
| 23 |
with open(out_path, "w", encoding="utf-8") as f:
|
| 24 |
+
for r in records:
|
| 25 |
json.dump(r, f, ensure_ascii=False)
|
| 26 |
f.write("\n")
|
| 27 |
return out_path
|